diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..1eea4306 --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ + +minerd +minerd.exe +*.o + +autom4te.cache +.deps + +Makefile +Makefile.in +INSTALL +aclocal.m4 +configure +configure.lineno +depcomp +missing +install-sh +stamp-h1 +cpuminer-config.h* +compile +config.log +config.status +config.status.lineno +config.guess +config.sub + +mingw32-config.cache + diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..27853baf --- /dev/null +++ b/.travis.yml @@ -0,0 +1,9 @@ +language: c + +compiler: + - gcc + +before_script: + - ./autogen.sh + - ./configure + - make \ No newline at end of file diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..984b32c6 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,5 @@ +Jeff Garzik + +ArtForz + +pooler diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..d60c31a9 --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. 
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. 
+However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. 
Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details.
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 00000000..326703b3 --- /dev/null +++ b/ChangeLog @@ -0,0 +1 @@ +See git repository ('git log') for full changelog. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..dc2101dd --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +# +# Dockerfile for cpuminer +# usage: docker run creack/cpuminer --url xxxx --user xxxx --pass xxxx +# ex: docker run creack/cpuminer --url stratum+tcp://ltc.pool.com:80 --user creack.worker1 --pass abcdef +# +# + +FROM ubuntu:12.10 +MAINTAINER Guillaume J. Charmes + +RUN apt-get update -qq + +RUN apt-get install -qqy automake +RUN apt-get install -qqy libcurl4-openssl-dev +RUN apt-get install -qqy git +RUN apt-get install -qqy make + +RUN git clone https://github.com/pooler/cpuminer + +RUN cd cpuminer && ./autogen.sh +RUN cd cpuminer && ./configure CFLAGS="-O3" +RUN cd cpuminer && make + +WORKDIR /cpuminer +ENTRYPOINT ["./minerd"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..c43e8b02 --- /dev/null +++ b/LICENSE @@ -0,0 +1,3 @@ +cpuminer is available under the terms of the GNU Public License version 2. + +See COPYING for details.
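For convenience, a quick usage sketch for the Dockerfile above, building the image locally under an illustrative tag instead of pulling creack/cpuminer; the pool URL and credentials are copied from the Dockerfile's own example comment:

    docker build -t cpuminer .
    docker run cpuminer --url stratum+tcp://ltc.pool.com:80 --user creack.worker1 --pass abcdef

Any arguments given after the image name are passed straight through to the ./minerd ENTRYPOINT.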
diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 00000000..9f05f738 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,41 @@ + +if WANT_JANSSON +JANSSON_INCLUDES= -I$(top_srcdir)/compat/jansson +else +JANSSON_INCLUDES= +endif + +EXTRA_DIST = example-cfg.json nomacro.pl + +SUBDIRS = compat + +INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) + +bin_PROGRAMS = minerd + +dist_man_MANS = minerd.1 + +minerd_SOURCES = elist.h miner.h compat.h \ + cpu-miner.c util.c \ + sha2.c scrypt.c keccak.c \ + heavy.c quark.c \ + sha3/sph_keccak.c \ + sha3/sph_hefty1.c \ + sha3/sph_groestl.c \ + sha3/sph_skein.c \ + sha3/sph_bmw.c \ + sha3/sph_jh.c \ + sha3/sph_blake.c +if ARCH_x86 +minerd_SOURCES += sha2-x86.S scrypt-x86.S +endif +if ARCH_x86_64 +minerd_SOURCES += sha2-x64.S scrypt-x64.S +endif +if ARCH_ARM +minerd_SOURCES += sha2-arm.S scrypt-arm.S +endif +minerd_LDFLAGS = $(PTHREAD_FLAGS) +minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ -lcrypto +minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ + diff --git a/NEWS b/NEWS new file mode 100644 index 00000000..a4ca024d --- /dev/null +++ b/NEWS @@ -0,0 +1,262 @@ +Version 2.3.3 - Feb 27, 2014 + +- The --url option is now mandatory +- Do not switch to Stratum when using an HTTP proxy +- Fix scheduling policy change on Linux (clbr) +- Fix CPU affinity on FreeBSD (ache) +- Compatibility fixes for various platforms, including Solaris 8 + and old versions of OS X +- A man page for minerd is now available + +Version 2.3.2 - Jul 10, 2013 + +- Add optimizations for AVX2-capable x86-64 processors +- Ensure that the output stream is flushed after every log message +- Fix an undefined-behavior bug in the Stratum code + +Version 2.3.1 - Jun 18, 2013 + +- Add a --cert option for specifying an SSL certificate (martinwguy) +- Fix a bug that only made SHA-256d mining work at difficulty 1 +- Fix a couple of compatibility issues with some Stratum servers + +Version 2.3 - Jun 12, 2013 + +- Add support for the Stratum mining protocol +- Automatically switch to Stratum if the mining server supports + the X-Stratum extension, unless --no-stratum is used +- Set CPU affinity on FreeBSD (lye) +- Fix a bug in libcurl initialization (martinwguy) + +Version 2.2.3 - Aug 5, 2012 + +- Add optimized ARM NEON code for scrypt and SHA-256d +- Add a --benchmark option that allows offline testing +- Support for the X-Reject-Reason extension + +Version 2.2.2 - Jun 7, 2012 + +- Various performance improvements for x86 and x86-64 +- Optimize scrypt for ARMv5E and later processors +- Set the priority of miner threads to idle on Windows +- Add an option to start minerd as a daemon on POSIX systems + +Version 2.2.1 - May 2, 2012 + +- Add optimized code for ARM processors +- Support for building on NetBSD and OpenBSD +- Various compatibility fixes for AIX (pontius) + +Version 2.2 - Apr 2, 2012 + +- Add an optimized SHA-256d algorithm, with specialized code + for x86 and x86-64 and support for AVX and XOP instructions +- Slight performance increase for scrypt on x86 and x86-64 +- The default timeout is now 270 seconds + +Version 2.1.5 - Mar 7, 2012 + +- Add optimizations for AVX-capable x86-64 processors +- Assume HTTP if no protocol is specified for the mining server +- Fix MinGW compatibility issues and update build instructions +- Add support for building on Solaris using gcc (pontius) + +Version 2.1.4 - Feb 28, 2012 + +- Implement 4-way SHA-256 on x86-64 +- Add TCP keepalive to long polling connections +- Support HTTP and SOCKS proxies via the --proxy option +- 
Username and password are no longer mandatory +- Add a script that makes assembly code compatible with old versions + of the GNU assembler that do not support macros + +Version 2.1.3 - Feb 12, 2012 + +- Smart handling of long polling failures: switch to short scan time + if long polling fails, and only try to reactivate it if the server + continues to advertise the feature in HTTP headers +- Add "X-Mining-Extensions: midstate" to HTTP headers (p2k) +- Add support for the "submitold" extension, used by p2pool +- It is now possible to specify username and password in the URL, + like this: http://username:password@host:port/ +- Add a --version option, and clean up --help output +- Avoid division by zero when computing hash rates +- Handle empty responses properly (TimothyA) +- Eliminate the delay between starting threads + +Version 2.1.2 - Jan 26, 2012 + +- Do not submit work that is known to be stale +- Allow miner threads to ask for new work if the current one is at least + 45 seconds old and long polling is enabled +- Refresh work when long polling times out +- Fix minor speed regression +- Modify x86-64 code to make it compatible with older versions of binutils + +Version 2.1.1 - Jan 20, 2012 + +- Handle network errors properly +- Make scantime retargeting more accurate + +Version 2.1 - Jan 19, 2012 + +- Share the same work among all threads +- Do not ask for new work if the current one is not expired +- Do not discard the work returned by long polling + +Version 2.0 - Jan 16, 2012 + +- Change default port to 9332 for Litecoin and remove default credentials +- Add 'scrypt' as the default algorithm and remove other algorithms (ArtForz) +- Optimize scrypt for x86 and x86-64 +- Make scantime retargeting less granular (ArtForz) +- Test the whole hash instead of just looking at the high 32 bits +- Add configurable timeout, with a default of 180 seconds +- Add share summary output (inlikeflynn) +- Fix priority and CPU count detection on Windows +- Fix parameters -u and -p, and add short options -o and -O + +Version 1.0.2 - Jun 13, 2011 + +- Linux x86_64 optimisations - Con Kolivas +- Optimise for x86_64 by default by using sse2_64 algo +- Detects CPUs and sets number of threads accordingly +- Uses CPU affinity for each thread where appropriate +- Sets scheduling policy to lowest possible +- Minor performance tweaks + +Version 1.0.1 - May 14, 2011 + +- OSX support + +Version 1.0 - May 9, 2011 + +- jansson 2.0 compatibility +- correct off-by-one in date (month) display output +- fix platform detection +- improve yasm configure bits +- support full URL, in X-Long-Polling header + +Version 0.8.1 - March 22, 2011 + +- Make --user, --pass actually work + +- Add User-Agent HTTP header to requests, so that server operators may + more easily identify the miner client. + +- Fix minor bug in example JSON config file + +Version 0.8 - March 21, 2011 + +- Support long polling: http://deepbit.net/longpolling.php + +- Adjust max workload based on scantime (default 5 seconds, + or 60 seconds for longpoll) + +- Standardize program output, and support syslog on Unix platforms + +- Support --user/--pass options (and "user" and "pass" in config file), + as an alternative to the current --userpass + +Version 0.7.2 - March 14, 2011 + +- Add port of ufasoft's sse2 assembly implementation (Linux only) + This is a substantial speed improvement on Intel CPUs. + +- Move all JSON-RPC I/O to separate thread.
This reduces the + number of HTTP connections from one-per-thread to one, reducing resource + usage on upstream bitcoind / pool server. + +Version 0.7.1 - March 2, 2011 + +- Add support for JSON-format configuration file. See example + file example-cfg.json. Any long argument on the command line + may be stored in the config file. +- Timestamp each solution found +- Improve sha256_4way performance. NOTE: This optimization makes + the 'hash' debug-print output for sha256_4way incorrect. +- Use __builtin_expect() intrinsic as compiler micro-optimization +- Build on Intel compiler +- HTTP library now follows HTTP redirects + +Version 0.7 - February 12, 2011 + +- Re-use CURL object, thereby reusing DNS cache and HTTP connections +- Use bswap_32, if compiler intrinsic is not available +- Disable full target validation (as opposed to simply H==0) for now + +Version 0.6.1 - February 4, 2011 + +- Fully validate "hash < target", rather than simply stopping our scan + if the high 32 bits are 00000000. +- Add --retry-pause, to set length of pause time between failure retries +- Display proof-of-work hash and target, if -D (debug mode) enabled +- Fix max-nonce auto-adjustment to actually work. This means if your + scan takes longer than 5 seconds (--scantime), the miner will slowly + reduce the number of hashes you work on, before fetching a new work unit. + +Version 0.6 - January 29, 2011 + +- Fetch new work unit, if scanhash takes longer than 5 seconds (--scantime) +- BeeCee1's sha256 4way optimizations +- lfm's byte swap optimization (improves via, cryptopp) +- Fix non-working short options -q, -r + +Version 0.5 - December 28, 2010 + +- Exit program, when all threads have exited +- Improve JSON-RPC failure diagnostics and resilience +- Add --quiet option, to disable hashmeter output. + +Version 0.3.3 - December 27, 2010 + +- Critical fix for sha256_cryptopp 'cryptopp_asm' algo + +Version 0.3.2 - December 23, 2010 + +- Critical fix for sha256_via + +Version 0.3.1 - December 19, 2010 + +- Critical fix for sha256_via +- Retry JSON-RPC failures (see --retry, under "minerd --help" output) + +Version 0.3 - December 18, 2010 + +- Add crypto++ 32bit assembly implementation +- show version upon 'minerd --help' +- work around gcc 4.5.x bug that killed 4way performance + +Version 0.2.2 - December 6, 2010 + +- VIA padlock implementation works now +- Minor build and runtime fixes + +Version 0.2.1 - November 29, 2010 + +- avoid buffer overflow when submitting solutions +- add Crypto++ sha256 implementation (C only, ASM elided for now) +- minor internal optimizations and cleanups + +Version 0.2 - November 27, 2010 + +- Add script for building a Windows installer +- improve hash performance (hashmeter) statistics +- add tcatm 4way sha256 implementation +- Add experimental VIA Padlock sha256 implementation + +Version 0.1.2 - November 26, 2010 + +- many small cleanups and micro-optimizations +- build win32 exe using mingw +- RPC URL, username/password become command line arguments +- remove unused OpenSSL dependency + +Version 0.1.1 - November 24, 2010 + +- Do not build sha256_generic module separately from cpuminer. + +Version 0.1 - November 24, 2010 + +- Initial release. + diff --git a/README b/README new file mode 100644 index 00000000..2dd93ac5 --- /dev/null +++ b/README @@ -0,0 +1,68 @@ +This is a multi-threaded CPU miner for Litecoin and Bitcoin, +fork of Jeff Garzik's reference cpuminer. + +License: GPLv2. See COPYING for details.
+ +Downloads: https://sourceforge.net/projects/cpuminer/files/ +Git tree: https://github.com/pooler/cpuminer + +Dependencies: + libcurl http://curl.haxx.se/libcurl/ + jansson http://www.digip.org/jansson/ + (jansson is included in-tree) + +Basic *nix build instructions: + ./autogen.sh # only needed if building from git repo + ./nomacro.pl # only needed if building on Mac OS X or with Clang + ./configure CFLAGS="-O3" + make + +Notes for AIX users: + * To build a 64-bit binary, export OBJECT_MODE=64 + * GNU-style long options are not supported, but are accessible + via configuration file + +Basic Windows build instructions, using MinGW: + Install MinGW and the MSYS Developer Tool Kit (http://www.mingw.org/) + * Make sure you have mstcpip.h in MinGW\include + If using MinGW-w64, install pthreads-w64 + Install libcurl devel (http://curl.haxx.se/download.html) + * Make sure you have libcurl.m4 in MinGW\share\aclocal + * Make sure you have curl-config in MinGW\bin + In the MSYS shell, run: + ./autogen.sh # only needed if building from git repo + LIBCURL="-lcurldll" ./configure CFLAGS="-O3" + make + +Architecture-specific notes: + ARM: No runtime CPU detection. The miner can take advantage + of some instructions specific to ARMv5E and later processors, + but the decision whether to use them is made at compile time, + based on compiler-defined macros. + To use NEON instructions, add "-mfpu=neon" to CFLAGS. + x86: The miner checks for SSE2 instruction support at runtime, + and uses them if they are available. + x86-64: The miner can take advantage of AVX, AVX2 and XOP instructions, + but only if both the CPU and the operating system support them. + * Linux supports AVX starting from kernel version 2.6.30. + * FreeBSD supports AVX starting with 9.1-RELEASE. + * Mac OS X added AVX support in the 10.6.8 update. + * Windows supports AVX starting from Windows 7 SP1 and + Windows Server 2008 R2 SP1. + The configure script outputs a warning if the assembler + doesn't support some instruction sets. In that case, the miner + can still be built, but unavailable optimizations are left off. + +Usage instructions: Run "minerd --help" to see options. + +Connecting through a proxy: Use the --proxy option. +To use a SOCKS proxy, add a socks4:// or socks5:// prefix to the proxy host. +Protocols socks4a and socks5h, allowing remote name resolving, are also +available since libcurl 7.18.0. +If no protocol is specified, the proxy is assumed to be an HTTP proxy. +When the --proxy option is not used, the program honors the http_proxy +and all_proxy environment variables. + +Also, many issues and FAQs are covered in the forum thread +dedicated to this program, + https://bitcointalk.org/index.php?topic=55038.0 diff --git a/autogen.sh b/autogen.sh new file mode 100644 index 00000000..989604a9 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +# You need autoconf 2.5x, preferably 2.57 or later +# You need automake 1.7 or later. 1.6 might work.
+ +set -e + +aclocal +autoheader +automake --gnu --add-missing --copy +autoconf + diff --git a/compat.h b/compat.h new file mode 100644 index 00000000..283fc9b6 --- /dev/null +++ b/compat.h @@ -0,0 +1,21 @@ +#ifndef __COMPAT_H__ +#define __COMPAT_H__ + +#ifdef WIN32 + +#include <windows.h> + +#define sleep(secs) Sleep((secs) * 1000) + +enum { + PRIO_PROCESS = 0, +}; + +static inline int setpriority(int which, int who, int prio) +{ + return -!SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_IDLE); +} + +#endif /* WIN32 */ + +#endif /* __COMPAT_H__ */ diff --git a/compat/Makefile.am b/compat/Makefile.am new file mode 100644 index 00000000..77af3c54 --- /dev/null +++ b/compat/Makefile.am @@ -0,0 +1,7 @@ + +if WANT_JANSSON +SUBDIRS = jansson +else +SUBDIRS = +endif + diff --git a/compat/jansson/.gitignore b/compat/jansson/.gitignore new file mode 100644 index 00000000..173737b6 --- /dev/null +++ b/compat/jansson/.gitignore @@ -0,0 +1,3 @@ + +libjansson.a + diff --git a/compat/jansson/LICENSE b/compat/jansson/LICENSE new file mode 100644 index 00000000..552b3498 --- /dev/null +++ b/compat/jansson/LICENSE @@ -0,0 +1,19 @@ +Copyright (c) 2009, 2010 Petri Lehtinen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/compat/jansson/Makefile.am b/compat/jansson/Makefile.am new file mode 100644 index 00000000..94a583f3 --- /dev/null +++ b/compat/jansson/Makefile.am @@ -0,0 +1,18 @@ + +noinst_LIBRARIES = libjansson.a + +libjansson_a_SOURCES = \ + config.h \ + dump.c \ + hashtable.c \ + hashtable.h \ + jansson.h \ + jansson_private.h \ + load.c \ + strbuffer.c \ + strbuffer.h \ + utf.c \ + utf.h \ + util.h \ + value.c + diff --git a/compat/jansson/config.h b/compat/jansson/config.h new file mode 100644 index 00000000..43858aa6 --- /dev/null +++ b/compat/jansson/config.h @@ -0,0 +1,73 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.ac by autoheader. */ + +/* Define to 1 if you have the <dlfcn.h> header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the <inttypes.h> header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the <memory.h> header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <stdlib.h> header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the <strings.h> header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the <string.h> header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the <sys/stat.h> header file.
*/ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the <unistd.h> header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. + */ +#define LT_OBJDIR ".libs/" + +/* Name of package */ +#define PACKAGE "jansson" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "petri@digip.org" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "jansson" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "jansson 1.3" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "jansson" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "1.3" + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Version number of package */ +#define VERSION "1.3" + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to the type of a signed integer type of width exactly 32 bits if + such a type exists and the standard includes do not define it. */ +/* #undef int32_t */ diff --git a/compat/jansson/dump.c b/compat/jansson/dump.c new file mode 100644 index 00000000..dc2101dd --- /dev/null +++ b/compat/jansson/dump.c @@ -0,0 +1,460 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include <jansson.h> +#include "jansson_private.h" +#include "strbuffer.h" +#include "utf.h" + +#define MAX_INTEGER_STR_LENGTH 100 +#define MAX_REAL_STR_LENGTH 100 + +typedef int (*dump_func)(const char *buffer, int size, void *data); + +struct string +{ + char *buffer; + int length; + int size; +}; + +static int dump_to_strbuffer(const char *buffer, int size, void *data) +{ + return strbuffer_append_bytes((strbuffer_t *)data, buffer, size); +} + +static int dump_to_file(const char *buffer, int size, void *data) +{ + FILE *dest = (FILE *)data; + if(fwrite(buffer, size, 1, dest) != 1) + return -1; + return 0; +} + +/* 256 spaces (the maximum indentation size) */ +static char whitespace[] = " "; + +static int dump_indent(unsigned long flags, int depth, int space, dump_func dump, void *data) +{ + if(JSON_INDENT(flags) > 0) + { + int i, ws_count = JSON_INDENT(flags); + + if(dump("\n", 1, data)) + return -1; + + for(i = 0; i < depth; i++) + { + if(dump(whitespace, ws_count, data)) + return -1; + } + } + else if(space && !(flags & JSON_COMPACT)) + { + return dump(" ", 1, data); + } + return 0; +} + +static int dump_string(const char *str, int ascii, dump_func dump, void *data) +{ + const char *pos, *end; + int32_t codepoint; + + if(dump("\"", 1, data)) + return -1; + + end = pos = str; + while(1) + { + const char *text; + char seq[13]; + int length; + + while(*end) + { + end = utf8_iterate(pos, &codepoint); + if(!end) + return -1; + + /* mandatory escape or control char */ + if(codepoint == '\\' || codepoint == '"' || codepoint < 0x20) + break; + + /* non-ASCII */ + if(ascii && codepoint > 0x7F) + break; + + pos = end; + } + + if(pos != str) { + if(dump(str, pos - str, data)) + return -1; + } + + if(end
== pos) + break; + + /* handle \, ", and control codes */ + length = 2; + switch(codepoint) + { + case '\\': text = "\\\\"; break; + case '\"': text = "\\\""; break; + case '\b': text = "\\b"; break; + case '\f': text = "\\f"; break; + case '\n': text = "\\n"; break; + case '\r': text = "\\r"; break; + case '\t': text = "\\t"; break; + default: + { + /* codepoint is in BMP */ + if(codepoint < 0x10000) + { + sprintf(seq, "\\u%04x", codepoint); + length = 6; + } + + /* not in BMP -> construct a UTF-16 surrogate pair */ + else + { + int32_t first, last; + + codepoint -= 0x10000; + first = 0xD800 | ((codepoint & 0xffc00) >> 10); + last = 0xDC00 | (codepoint & 0x003ff); + + sprintf(seq, "\\u%04x\\u%04x", first, last); + length = 12; + } + + text = seq; + break; + } + } + + if(dump(text, length, data)) + return -1; + + str = pos = end; + } + + return dump("\"", 1, data); +} + +static int object_key_compare_keys(const void *key1, const void *key2) +{ + return strcmp((*(const object_key_t **)key1)->key, + (*(const object_key_t **)key2)->key); +} + +static int object_key_compare_serials(const void *key1, const void *key2) +{ + return (*(const object_key_t **)key1)->serial - + (*(const object_key_t **)key2)->serial; +} + +static int do_dump(const json_t *json, unsigned long flags, int depth, + dump_func dump, void *data) +{ + int ascii = flags & JSON_ENSURE_ASCII ? 1 : 0; + + switch(json_typeof(json)) { + case JSON_NULL: + return dump("null", 4, data); + + case JSON_TRUE: + return dump("true", 4, data); + + case JSON_FALSE: + return dump("false", 5, data); + + case JSON_INTEGER: + { + char buffer[MAX_INTEGER_STR_LENGTH]; + int size; + + size = snprintf(buffer, MAX_INTEGER_STR_LENGTH, "%d", json_integer_value(json)); + if(size >= MAX_INTEGER_STR_LENGTH) + return -1; + + return dump(buffer, size, data); + } + + case JSON_REAL: + { + char buffer[MAX_REAL_STR_LENGTH]; + int size; + + size = snprintf(buffer, MAX_REAL_STR_LENGTH, "%.17g", + json_real_value(json)); + if(size >= MAX_REAL_STR_LENGTH) + return -1; + + /* Make sure there's a dot or 'e' in the output. 
Otherwise + a real is converted to an integer when decoding */ + if(strchr(buffer, '.') == NULL && + strchr(buffer, 'e') == NULL) + { + if(size + 2 >= MAX_REAL_STR_LENGTH) { + /* No space to append ".0" */ + return -1; + } + buffer[size] = '.'; + buffer[size + 1] = '0'; + size += 2; + } + + return dump(buffer, size, data); + } + + case JSON_STRING: + return dump_string(json_string_value(json), ascii, dump, data); + + case JSON_ARRAY: + { + int i; + int n; + json_array_t *array; + + /* detect circular references */ + array = json_to_array(json); + if(array->visited) + goto array_error; + array->visited = 1; + + n = json_array_size(json); + + if(dump("[", 1, data)) + goto array_error; + if(n == 0) { + array->visited = 0; + return dump("]", 1, data); + } + if(dump_indent(flags, depth + 1, 0, dump, data)) + goto array_error; + + for(i = 0; i < n; ++i) { + if(do_dump(json_array_get(json, i), flags, depth + 1, + dump, data)) + goto array_error; + + if(i < n - 1) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + goto array_error; + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + goto array_error; + } + } + + array->visited = 0; + return dump("]", 1, data); + + array_error: + array->visited = 0; + return -1; + } + + case JSON_OBJECT: + { + json_object_t *object; + void *iter; + const char *separator; + int separator_length; + + if(flags & JSON_COMPACT) { + separator = ":"; + separator_length = 1; + } + else { + separator = ": "; + separator_length = 2; + } + + /* detect circular references */ + object = json_to_object(json); + if(object->visited) + goto object_error; + object->visited = 1; + + iter = json_object_iter((json_t *)json); + + if(dump("{", 1, data)) + goto object_error; + if(!iter) { + object->visited = 0; + return dump("}", 1, data); + } + if(dump_indent(flags, depth + 1, 0, dump, data)) + goto object_error; + + if(flags & JSON_SORT_KEYS || flags & JSON_PRESERVE_ORDER) + { + const object_key_t **keys; + unsigned int size; + unsigned int i; + int (*cmp_func)(const void *, const void *); + + size = json_object_size(json); + keys = malloc(size * sizeof(object_key_t *)); + if(!keys) + goto object_error; + + i = 0; + while(iter) + { + keys[i] = jsonp_object_iter_fullkey(iter); + iter = json_object_iter_next((json_t *)json, iter); + i++; + } + assert(i == size); + + if(flags & JSON_SORT_KEYS) + cmp_func = object_key_compare_keys; + else + cmp_func = object_key_compare_serials; + + qsort(keys, size, sizeof(object_key_t *), cmp_func); + + for(i = 0; i < size; i++) + { + const char *key; + json_t *value; + + key = keys[i]->key; + value = json_object_get(json, key); + assert(value); + + dump_string(key, ascii, dump, data); + if(dump(separator, separator_length, data) || + do_dump(value, flags, depth + 1, dump, data)) + { + free(keys); + goto object_error; + } + + if(i < size - 1) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + { + free(keys); + goto object_error; + } + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + { + free(keys); + goto object_error; + } + } + } + + free(keys); + } + else + { + /* Don't sort keys */ + + while(iter) + { + void *next = json_object_iter_next((json_t *)json, iter); + + dump_string(json_object_iter_key(iter), ascii, dump, data); + if(dump(separator, separator_length, data) || + do_dump(json_object_iter_value(iter), flags, depth + 1, + dump, data)) + goto object_error; + + if(next) + { + if(dump(",", 1, data) || + dump_indent(flags, depth + 1, 1, dump, data)) + goto 
object_error; + } + else + { + if(dump_indent(flags, depth, 0, dump, data)) + goto object_error; + } + + iter = next; + } + } + + object->visited = 0; + return dump("}", 1, data); + + object_error: + object->visited = 0; + return -1; + } + + default: + /* not reached */ + return -1; + } +} + + +char *json_dumps(const json_t *json, unsigned long flags) +{ + strbuffer_t strbuff; + char *result; + + if(!json_is_array(json) && !json_is_object(json)) + return NULL; + + if(strbuffer_init(&strbuff)) + return NULL; + + if(do_dump(json, flags, 0, dump_to_strbuffer, (void *)&strbuff)) { + strbuffer_close(&strbuff); + return NULL; + } + + result = strdup(strbuffer_value(&strbuff)); + strbuffer_close(&strbuff); + + return result; +} + +int json_dumpf(const json_t *json, FILE *output, unsigned long flags) +{ + if(!json_is_array(json) && !json_is_object(json)) + return -1; + + return do_dump(json, flags, 0, dump_to_file, (void *)output); +} + +int json_dump_file(const json_t *json, const char *path, unsigned long flags) +{ + int result; + + FILE *output = fopen(path, "w"); + if(!output) + return -1; + + result = json_dumpf(json, output, flags); + + fclose(output); + return result; +} diff --git a/compat/jansson/hashtable.c b/compat/jansson/hashtable.c new file mode 100644 index 00000000..a3120479 --- /dev/null +++ b/compat/jansson/hashtable.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#include <config.h> + +#include <stdlib.h> +#include "hashtable.h" + +typedef struct hashtable_list list_t; +typedef struct hashtable_pair pair_t; +typedef struct hashtable_bucket bucket_t; + +#define container_of(ptr_, type_, member_) \ + ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_)) + +#define list_to_pair(list_) container_of(list_, pair_t, list) + +static inline void list_init(list_t *list) +{ + list->next = list; + list->prev = list; +} + +static inline void list_insert(list_t *list, list_t *node) +{ + node->next = list; + node->prev = list->prev; + list->prev->next = node; + list->prev = node; +} + +static inline void list_remove(list_t *list) +{ + list->prev->next = list->next; + list->next->prev = list->prev; +} + +static inline int bucket_is_empty(hashtable_t *hashtable, bucket_t *bucket) +{ + return bucket->first == &hashtable->list && bucket->first == bucket->last; +} + +static void insert_to_bucket(hashtable_t *hashtable, bucket_t *bucket, + list_t *list) +{ + if(bucket_is_empty(hashtable, bucket)) + { + list_insert(&hashtable->list, list); + bucket->first = bucket->last = list; + } + else + { + list_insert(bucket->first, list); + bucket->first = list; + } +} + +static unsigned int primes[] = { + 5, 13, 23, 53, 97, 193, 389, 769, 1543, 3079, 6151, 12289, 24593, + 49157, 98317, 196613, 393241, 786433, 1572869, 3145739, 6291469, + 12582917, 25165843, 50331653, 100663319, 201326611, 402653189, + 805306457, 1610612741 +}; +static const unsigned int num_primes = sizeof(primes) / sizeof(unsigned int); + +static inline unsigned int num_buckets(hashtable_t *hashtable) +{ + return primes[hashtable->num_buckets]; +} + + +static pair_t *hashtable_find_pair(hashtable_t *hashtable, bucket_t *bucket, + const void *key, unsigned int hash) +{ + list_t *list; + pair_t *pair; + + if(bucket_is_empty(hashtable, bucket)) + return NULL; + + list = bucket->first; + while(1) + { + pair = list_to_pair(list); + if(pair->hash == hash && hashtable->cmp_keys(pair->key, key)) +
return pair; + + if(list == bucket->last) + break; + + list = list->next; + } + + return NULL; +} + +/* returns 0 on success, -1 if key was not found */ +static int hashtable_do_del(hashtable_t *hashtable, + const void *key, unsigned int hash) +{ + pair_t *pair; + bucket_t *bucket; + unsigned int index; + + index = hash % num_buckets(hashtable); + bucket = &hashtable->buckets[index]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return -1; + + if(&pair->list == bucket->first && &pair->list == bucket->last) + bucket->first = bucket->last = &hashtable->list; + + else if(&pair->list == bucket->first) + bucket->first = pair->list.next; + + else if(&pair->list == bucket->last) + bucket->last = pair->list.prev; + + list_remove(&pair->list); + + if(hashtable->free_key) + hashtable->free_key(pair->key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + + free(pair); + hashtable->size--; + + return 0; +} + +static void hashtable_do_clear(hashtable_t *hashtable) +{ + list_t *list, *next; + pair_t *pair; + + for(list = hashtable->list.next; list != &hashtable->list; list = next) + { + next = list->next; + pair = list_to_pair(list); + if(hashtable->free_key) + hashtable->free_key(pair->key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + free(pair); + } +} + +static int hashtable_do_rehash(hashtable_t *hashtable) +{ + list_t *list, *next; + pair_t *pair; + unsigned int i, index, new_size; + + free(hashtable->buckets); + + hashtable->num_buckets++; + new_size = num_buckets(hashtable); + + hashtable->buckets = malloc(new_size * sizeof(bucket_t)); + if(!hashtable->buckets) + return -1; + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + list = hashtable->list.next; + list_init(&hashtable->list); + + for(; list != &hashtable->list; list = next) { + next = list->next; + pair = list_to_pair(list); + index = pair->hash % new_size; + insert_to_bucket(hashtable, &hashtable->buckets[index], &pair->list); + } + + return 0; +} + + +hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value) +{ + hashtable_t *hashtable = malloc(sizeof(hashtable_t)); + if(!hashtable) + return NULL; + + if(hashtable_init(hashtable, hash_key, cmp_keys, free_key, free_value)) + { + free(hashtable); + return NULL; + } + + return hashtable; +} + +void hashtable_destroy(hashtable_t *hashtable) +{ + hashtable_close(hashtable); + free(hashtable); +} + +int hashtable_init(hashtable_t *hashtable, + key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value) +{ + unsigned int i; + + hashtable->size = 0; + hashtable->num_buckets = 0; /* index to primes[] */ + hashtable->buckets = malloc(num_buckets(hashtable) * sizeof(bucket_t)); + if(!hashtable->buckets) + return -1; + + list_init(&hashtable->list); + + hashtable->hash_key = hash_key; + hashtable->cmp_keys = cmp_keys; + hashtable->free_key = free_key; + hashtable->free_value = free_value; + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + return 0; +} + +void hashtable_close(hashtable_t *hashtable) +{ + hashtable_do_clear(hashtable); + free(hashtable->buckets); +} + +int hashtable_set(hashtable_t *hashtable, void *key, void *value) +{ + pair_t *pair; + bucket_t *bucket; + unsigned int hash, index; + + /* rehash if the load ratio exceeds 1 */ + if(hashtable->size >= 
num_buckets(hashtable)) + if(hashtable_do_rehash(hashtable)) + return -1; + + hash = hashtable->hash_key(key); + index = hash % num_buckets(hashtable); + bucket = &hashtable->buckets[index]; + pair = hashtable_find_pair(hashtable, bucket, key, hash); + + if(pair) + { + if(hashtable->free_key) + hashtable->free_key(key); + if(hashtable->free_value) + hashtable->free_value(pair->value); + pair->value = value; + } + else + { + pair = malloc(sizeof(pair_t)); + if(!pair) + return -1; + + pair->key = key; + pair->value = value; + pair->hash = hash; + list_init(&pair->list); + + insert_to_bucket(hashtable, bucket, &pair->list); + + hashtable->size++; + } + return 0; +} + +void *hashtable_get(hashtable_t *hashtable, const void *key) +{ + pair_t *pair; + unsigned int hash; + bucket_t *bucket; + + hash = hashtable->hash_key(key); + bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return NULL; + + return pair->value; +} + +int hashtable_del(hashtable_t *hashtable, const void *key) +{ + unsigned int hash = hashtable->hash_key(key); + return hashtable_do_del(hashtable, key, hash); +} + +void hashtable_clear(hashtable_t *hashtable) +{ + unsigned int i; + + hashtable_do_clear(hashtable); + + for(i = 0; i < num_buckets(hashtable); i++) + { + hashtable->buckets[i].first = hashtable->buckets[i].last = + &hashtable->list; + } + + list_init(&hashtable->list); + hashtable->size = 0; +} + +void *hashtable_iter(hashtable_t *hashtable) +{ + return hashtable_iter_next(hashtable, &hashtable->list); +} + +void *hashtable_iter_at(hashtable_t *hashtable, const void *key) +{ + pair_t *pair; + unsigned int hash; + bucket_t *bucket; + + hash = hashtable->hash_key(key); + bucket = &hashtable->buckets[hash % num_buckets(hashtable)]; + + pair = hashtable_find_pair(hashtable, bucket, key, hash); + if(!pair) + return NULL; + + return &pair->list; +} + +void *hashtable_iter_next(hashtable_t *hashtable, void *iter) +{ + list_t *list = (list_t *)iter; + if(list->next == &hashtable->list) + return NULL; + return list->next; +} + +void *hashtable_iter_key(void *iter) +{ + pair_t *pair = list_to_pair((list_t *)iter); + return pair->key; +} + +void *hashtable_iter_value(void *iter) +{ + pair_t *pair = list_to_pair((list_t *)iter); + return pair->value; +} + +void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value) +{ + pair_t *pair = list_to_pair((list_t *)iter); + + if(hashtable->free_value) + hashtable->free_value(pair->value); + + pair->value = value; +} diff --git a/compat/jansson/hashtable.h b/compat/jansson/hashtable.h new file mode 100644 index 00000000..f03a7690 --- /dev/null +++ b/compat/jansson/hashtable.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ */ + +#ifndef HASHTABLE_H +#define HASHTABLE_H + +typedef unsigned int (*key_hash_fn)(const void *key); +typedef int (*key_cmp_fn)(const void *key1, const void *key2); +typedef void (*free_fn)(void *key); + +struct hashtable_list { + struct hashtable_list *prev; + struct hashtable_list *next; +}; + +struct hashtable_pair { + void *key; + void *value; + unsigned int hash; + struct hashtable_list list; +}; + +struct hashtable_bucket { + struct hashtable_list *first; + struct hashtable_list *last; +}; + +typedef struct hashtable { + unsigned int size; + struct hashtable_bucket *buckets; + unsigned int num_buckets; /* index to primes[] */ + struct hashtable_list list; + + key_hash_fn hash_key; + key_cmp_fn cmp_keys; /* returns non-zero for equal keys */ + free_fn free_key; + free_fn free_value; +} hashtable_t; + +/** + * hashtable_create - Create a hashtable object + * + * @hash_key: The key hashing function + * @cmp_keys: The key compare function. Returns non-zero for equal and + * zero for unequal keys + * @free_key: If non-NULL, called for a key that is no longer referenced. + * @free_value: If non-NULL, called for a value that is no longer referenced. + * + * Returns a new hashtable object that should be freed with + * hashtable_destroy when it's no longer used, or NULL on failure (out + * of memory). + */ +hashtable_t *hashtable_create(key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value); + +/** + * hashtable_destroy - Destroy a hashtable object + * + * @hashtable: The hashtable + * + * Destroys a hashtable created with hashtable_create(). + */ +void hashtable_destroy(hashtable_t *hashtable); + +/** + * hashtable_init - Initialize a hashtable object + * + * @hashtable: The (statically allocated) hashtable object + * @hash_key: The key hashing function + * @cmp_keys: The key compare function. Returns non-zero for equal and + * zero for unequal keys + * @free_key: If non-NULL, called for a key that is no longer referenced. + * @free_value: If non-NULL, called for a value that is no longer referenced. + * + * Initializes a statically allocated hashtable object. The object + * should be cleared with hashtable_close when it's no longer used. + * + * Returns 0 on success, -1 on error (out of memory). + */ +int hashtable_init(hashtable_t *hashtable, + key_hash_fn hash_key, key_cmp_fn cmp_keys, + free_fn free_key, free_fn free_value); + +/** + * hashtable_close - Release all resources used by a hashtable object + * + * @hashtable: The hashtable + * + * Destroys a statically allocated hashtable object. + */ +void hashtable_close(hashtable_t *hashtable); + +/** + * hashtable_set - Add/modify value in hashtable + * + * @hashtable: The hashtable object + * @key: The key + * @value: The value + * + * If a value with the given key already exists, its value is replaced + * with the new value. + * + * Key and value are "stolen" in the sense that hashtable frees them + * automatically when they are no longer used. The freeing is + * accomplished by calling the free_key and free_value functions that were + * supplied to hashtable_create. In case one or both of the free + * functions is NULL, the corresponding item is not "stolen". + * + * Returns 0 on success, -1 on failure (out of memory). + */ +int hashtable_set(hashtable_t *hashtable, void *key, void *value); + +/** + * hashtable_get - Get a value associated with a key + * + * @hashtable: The hashtable object + * @key: The key + * + * Returns value if it is found, or NULL otherwise.
+ */
+void *hashtable_get(hashtable_t *hashtable, const void *key);
+
+/**
+ * hashtable_del - Remove a value from the hashtable
+ *
+ * @hashtable: The hashtable object
+ * @key: The key
+ *
+ * Returns 0 on success, or -1 if the key was not found.
+ */
+int hashtable_del(hashtable_t *hashtable, const void *key);
+
+/**
+ * hashtable_clear - Clear hashtable
+ *
+ * @hashtable: The hashtable object
+ *
+ * Removes all items from the hashtable.
+ */
+void hashtable_clear(hashtable_t *hashtable);
+
+/**
+ * hashtable_iter - Iterate over hashtable
+ *
+ * @hashtable: The hashtable object
+ *
+ * Returns an opaque iterator to the first element in the hashtable.
+ * The iterator should be passed to hashtable_iter_* functions.
+ * The hashtable items are not iterated over in any particular order.
+ *
+ * There's no need to free the iterator in any way. The iterator is
+ * valid as long as the item that is referenced by the iterator is not
+ * deleted. Other values may be added or deleted. In particular,
+ * hashtable_iter_next() may be called on an iterator, and after that
+ * the key/value pair pointed to by the old iterator may be deleted.
+ */
+void *hashtable_iter(hashtable_t *hashtable);
+
+/**
+ * hashtable_iter_at - Return an iterator at a specific key
+ *
+ * @hashtable: The hashtable object
+ * @key: The key that the iterator should point to
+ *
+ * Like hashtable_iter() but returns an iterator pointing to a
+ * specific key.
+ */
+void *hashtable_iter_at(hashtable_t *hashtable, const void *key);
+
+/**
+ * hashtable_iter_next - Advance an iterator
+ *
+ * @hashtable: The hashtable object
+ * @iter: The iterator
+ *
+ * Returns a new iterator pointing to the next element in the
+ * hashtable or NULL if the whole hashtable has been iterated over.
+ */
+void *hashtable_iter_next(hashtable_t *hashtable, void *iter);
+
+/**
+ * hashtable_iter_key - Retrieve the key pointed to by an iterator
+ *
+ * @iter: The iterator
+ */
+void *hashtable_iter_key(void *iter);
+
+/**
+ * hashtable_iter_value - Retrieve the value pointed to by an iterator
+ *
+ * @iter: The iterator
+ */
+void *hashtable_iter_value(void *iter);
+
+/**
+ * hashtable_iter_set - Set the value pointed to by an iterator
+ *
+ * @iter: The iterator
+ * @value: The value to set
+ */
+void hashtable_iter_set(hashtable_t *hashtable, void *iter, void *value);
+
+#endif
diff --git a/compat/jansson/jansson.h b/compat/jansson/jansson.h
new file mode 100644
index 00000000..4c526fee
--- /dev/null
+++ b/compat/jansson/jansson.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#ifndef JANSSON_H
+#define JANSSON_H
+
+#include <stdio.h>
+
+#ifndef __cplusplus
+#define JSON_INLINE inline
+#else
+#define JSON_INLINE inline
+extern "C" {
+#endif
+
+/* types */
+
+typedef enum {
+    JSON_OBJECT,
+    JSON_ARRAY,
+    JSON_STRING,
+    JSON_INTEGER,
+    JSON_REAL,
+    JSON_TRUE,
+    JSON_FALSE,
+    JSON_NULL
+} json_type;
+
+typedef struct {
+    json_type type;
+    unsigned long refcount;
+} json_t;
+
+#define json_typeof(json)      ((json)->type)
+#define json_is_object(json)   (json && json_typeof(json) == JSON_OBJECT)
+#define json_is_array(json)    (json && json_typeof(json) == JSON_ARRAY)
+#define json_is_string(json)   (json && json_typeof(json) == JSON_STRING)
+#define json_is_integer(json)  (json && json_typeof(json) == JSON_INTEGER)
+#define json_is_real(json)     (json && json_typeof(json) == JSON_REAL)
+#define json_is_number(json)   (json_is_integer(json) || json_is_real(json))
+#define json_is_true(json)     (json && json_typeof(json) == JSON_TRUE)
+#define json_is_false(json)    (json && json_typeof(json) == JSON_FALSE)
+#define json_is_boolean(json)  (json_is_true(json) || json_is_false(json))
+#define json_is_null(json)     (json && json_typeof(json) == JSON_NULL)
+
+/* construction, destruction, reference counting */
+
+json_t *json_object(void);
+json_t *json_array(void);
+json_t *json_string(const char *value);
+json_t *json_string_nocheck(const char *value);
+json_t *json_integer(int value);
+json_t *json_real(double value);
+json_t *json_true(void);
+json_t *json_false(void);
+json_t *json_null(void);
+
+static JSON_INLINE
+json_t *json_incref(json_t *json)
+{
+    if(json && json->refcount != (unsigned int)-1)
+        ++json->refcount;
+    return json;
+}
+
+/* do not call json_delete directly */
+void json_delete(json_t *json);
+
+static JSON_INLINE
+void json_decref(json_t *json)
+{
+    if(json && json->refcount != (unsigned int)-1 && --json->refcount == 0)
+        json_delete(json);
+}
+
+
+/* getters, setters, manipulation */
+
+unsigned int json_object_size(const json_t *object);
+json_t *json_object_get(const json_t *object, const char *key);
+int json_object_set_new(json_t *object, const char *key, json_t *value);
+int json_object_set_new_nocheck(json_t *object, const char *key, json_t *value);
+int json_object_del(json_t *object, const char *key);
+int json_object_clear(json_t *object);
+int json_object_update(json_t *object, json_t *other);
+void *json_object_iter(json_t *object);
+void *json_object_iter_at(json_t *object, const char *key);
+void *json_object_iter_next(json_t *object, void *iter);
+const char *json_object_iter_key(void *iter);
+json_t *json_object_iter_value(void *iter);
+int json_object_iter_set_new(json_t *object, void *iter, json_t *value);
+
+static JSON_INLINE
+int json_object_set(json_t *object, const char *key, json_t *value)
+{
+    return json_object_set_new(object, key, json_incref(value));
+}
+
+static JSON_INLINE
+int json_object_set_nocheck(json_t *object, const char *key, json_t *value)
+{
+    return json_object_set_new_nocheck(object, key, json_incref(value));
+}
+
+static JSON_INLINE
+int json_object_iter_set(json_t *object, void *iter, json_t *value)
+{
+    return json_object_iter_set_new(object, iter, json_incref(value));
+}
+
+unsigned int json_array_size(const json_t *array);
+json_t *json_array_get(const json_t *array, unsigned int index);
+int json_array_set_new(json_t *array, unsigned int index, json_t *value);
+int json_array_append_new(json_t *array, json_t *value);
+int json_array_insert_new(json_t *array, unsigned int index, json_t *value);
+int json_array_remove(json_t *array,
unsigned int index); +int json_array_clear(json_t *array); +int json_array_extend(json_t *array, json_t *other); + +static JSON_INLINE +int json_array_set(json_t *array, unsigned int index, json_t *value) +{ + return json_array_set_new(array, index, json_incref(value)); +} + +static JSON_INLINE +int json_array_append(json_t *array, json_t *value) +{ + return json_array_append_new(array, json_incref(value)); +} + +static JSON_INLINE +int json_array_insert(json_t *array, unsigned int index, json_t *value) +{ + return json_array_insert_new(array, index, json_incref(value)); +} + +const char *json_string_value(const json_t *string); +int json_integer_value(const json_t *integer); +double json_real_value(const json_t *real); +double json_number_value(const json_t *json); + +int json_string_set(json_t *string, const char *value); +int json_string_set_nocheck(json_t *string, const char *value); +int json_integer_set(json_t *integer, int value); +int json_real_set(json_t *real, double value); + + +/* equality */ + +int json_equal(json_t *value1, json_t *value2); + + +/* copying */ + +json_t *json_copy(json_t *value); +json_t *json_deep_copy(json_t *value); + + +/* loading, printing */ + +#define JSON_ERROR_TEXT_LENGTH 160 + +typedef struct { + char text[JSON_ERROR_TEXT_LENGTH]; + int line; +} json_error_t; + +json_t *json_loads(const char *input, json_error_t *error); +json_t *json_loadf(FILE *input, json_error_t *error); +json_t *json_load_file(const char *path, json_error_t *error); + +#define JSON_INDENT(n) (n & 0xFF) +#define JSON_COMPACT 0x100 +#define JSON_ENSURE_ASCII 0x200 +#define JSON_SORT_KEYS 0x400 +#define JSON_PRESERVE_ORDER 0x800 + +char *json_dumps(const json_t *json, unsigned long flags); +int json_dumpf(const json_t *json, FILE *output, unsigned long flags); +int json_dump_file(const json_t *json, const char *path, unsigned long flags); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/compat/jansson/jansson_private.h b/compat/jansson/jansson_private.h new file mode 100644 index 00000000..4490702a --- /dev/null +++ b/compat/jansson/jansson_private.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. 
+ */
+
+#ifndef JANSSON_PRIVATE_H
+#define JANSSON_PRIVATE_H
+
+#include "jansson.h"
+#include "hashtable.h"
+
+#define container_of(ptr_, type_, member_)                      \
+    ((type_ *)((char *)ptr_ - (size_t)&((type_ *)0)->member_))
+
+typedef struct {
+    json_t json;
+    hashtable_t hashtable;
+    unsigned long serial;
+    int visited;
+} json_object_t;
+
+typedef struct {
+    json_t json;
+    unsigned int size;
+    unsigned int entries;
+    json_t **table;
+    int visited;
+} json_array_t;
+
+typedef struct {
+    json_t json;
+    char *value;
+} json_string_t;
+
+typedef struct {
+    json_t json;
+    double value;
+} json_real_t;
+
+typedef struct {
+    json_t json;
+    int value;
+} json_integer_t;
+
+#define json_to_object(json_)  container_of(json_, json_object_t, json)
+#define json_to_array(json_)   container_of(json_, json_array_t, json)
+#define json_to_string(json_)  container_of(json_, json_string_t, json)
+#define json_to_real(json_)    container_of(json_, json_real_t, json)
+#define json_to_integer(json_) container_of(json_, json_integer_t, json)
+
+typedef struct {
+    unsigned long serial;
+    char key[];
+} object_key_t;
+
+const object_key_t *jsonp_object_iter_fullkey(void *iter);
+
+#endif
diff --git a/compat/jansson/load.c b/compat/jansson/load.c
new file mode 100644
index 00000000..d49a4da5
--- /dev/null
+++ b/compat/jansson/load.c
@@ -0,0 +1,879 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <jansson.h>
+#include "jansson_private.h"
+#include "strbuffer.h"
+#include "utf.h"
+
+#define TOKEN_INVALID         -1
+#define TOKEN_EOF              0
+#define TOKEN_STRING         256
+#define TOKEN_INTEGER        257
+#define TOKEN_REAL           258
+#define TOKEN_TRUE           259
+#define TOKEN_FALSE          260
+#define TOKEN_NULL           261
+
+/* read one byte from stream, return EOF on end of file */
+typedef int (*get_func)(void *data);
+
+/* return non-zero if end of file has been reached */
+typedef int (*eof_func)(void *data);
+
+typedef struct {
+    get_func get;
+    eof_func eof;
+    void *data;
+    int stream_pos;
+    char buffer[5];
+    int buffer_pos;
+} stream_t;
+
+
+typedef struct {
+    stream_t stream;
+    strbuffer_t saved_text;
+    int token;
+    int line, column;
+    union {
+        char *string;
+        int integer;
+        double real;
+    } value;
+} lex_t;
+
+
+/*** error reporting ***/
+
+static void error_init(json_error_t *error)
+{
+    if(error)
+    {
+        error->text[0] = '\0';
+        error->line = -1;
+    }
+}
+
+static void error_set(json_error_t *error, const lex_t *lex,
+                      const char *msg, ...)
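+/* Descriptive note, not upstream documentation: the first error wins.
+   If *error was already filled in by an earlier failure, this function
+   returns without overwriting it. */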
+{ + va_list ap; + char text[JSON_ERROR_TEXT_LENGTH]; + + if(!error || error->text[0] != '\0') { + /* error already set */ + return; + } + + va_start(ap, msg); + vsnprintf(text, JSON_ERROR_TEXT_LENGTH, msg, ap); + va_end(ap); + + if(lex) + { + const char *saved_text = strbuffer_value(&lex->saved_text); + error->line = lex->line; + if(saved_text && saved_text[0]) + { + if(lex->saved_text.length <= 20) { + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, + "%s near '%s'", text, saved_text); + } + else + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); + } + else + { + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, + "%s near end of file", text); + } + } + else + { + error->line = -1; + snprintf(error->text, JSON_ERROR_TEXT_LENGTH, "%s", text); + } +} + + +/*** lexical analyzer ***/ + +static void +stream_init(stream_t *stream, get_func get, eof_func eof, void *data) +{ + stream->get = get; + stream->eof = eof; + stream->data = data; + stream->stream_pos = 0; + stream->buffer[0] = '\0'; + stream->buffer_pos = 0; +} + +static char stream_get(stream_t *stream, json_error_t *error) +{ + char c; + + if(!stream->buffer[stream->buffer_pos]) + { + stream->buffer[0] = stream->get(stream->data); + stream->buffer_pos = 0; + + c = stream->buffer[0]; + + if((unsigned char)c >= 0x80 && c != (char)EOF) + { + /* multi-byte UTF-8 sequence */ + int i, count; + + count = utf8_check_first(c); + if(!count) + goto out; + + assert(count >= 2); + + for(i = 1; i < count; i++) + stream->buffer[i] = stream->get(stream->data); + + if(!utf8_check_full(stream->buffer, count, NULL)) + goto out; + + stream->stream_pos += count; + stream->buffer[count] = '\0'; + } + else { + stream->buffer[1] = '\0'; + stream->stream_pos++; + } + } + + return stream->buffer[stream->buffer_pos++]; + +out: + error_set(error, NULL, "unable to decode byte 0x%x at position %d", + (unsigned char)c, stream->stream_pos); + + stream->buffer[0] = EOF; + stream->buffer[1] = '\0'; + stream->buffer_pos = 1; + + return EOF; +} + +static void stream_unget(stream_t *stream, char c) +{ + assert(stream->buffer_pos > 0); + stream->buffer_pos--; + assert(stream->buffer[stream->buffer_pos] == c); +} + + +static int lex_get(lex_t *lex, json_error_t *error) +{ + return stream_get(&lex->stream, error); +} + +static int lex_eof(lex_t *lex) +{ + return lex->stream.eof(lex->stream.data); +} + +static void lex_save(lex_t *lex, char c) +{ + strbuffer_append_byte(&lex->saved_text, c); +} + +static int lex_get_save(lex_t *lex, json_error_t *error) +{ + char c = stream_get(&lex->stream, error); + lex_save(lex, c); + return c; +} + +static void lex_unget_unsave(lex_t *lex, char c) +{ + char d; + stream_unget(&lex->stream, c); + d = strbuffer_pop(&lex->saved_text); + assert(c == d); +} + +static void lex_save_cached(lex_t *lex) +{ + while(lex->stream.buffer[lex->stream.buffer_pos] != '\0') + { + lex_save(lex, lex->stream.buffer[lex->stream.buffer_pos]); + lex->stream.buffer_pos++; + } +} + +/* assumes that str points to 'u' plus at least 4 valid hex digits */ +static int32_t decode_unicode_escape(const char *str) +{ + int i; + int32_t value = 0; + + assert(str[0] == 'u'); + + for(i = 1; i <= 4; i++) { + char c = str[i]; + value <<= 4; + if(isdigit(c)) + value += c - '0'; + else if(islower(c)) + value += c - 'a' + 10; + else if(isupper(c)) + value += c - 'A' + 10; + else + assert(0); + } + + return value; +} + +static void lex_scan_string(lex_t *lex, json_error_t *error) +{ + char c; + const char *p; + char *t; + int i; + + lex->value.string = NULL; + lex->token = 
TOKEN_INVALID; + + c = lex_get_save(lex, error); + + while(c != '"') { + if(c == (char)EOF) { + lex_unget_unsave(lex, c); + if(lex_eof(lex)) + error_set(error, lex, "premature end of input"); + goto out; + } + + else if((unsigned char)c <= 0x1F) { + /* control character */ + lex_unget_unsave(lex, c); + if(c == '\n') + error_set(error, lex, "unexpected newline", c); + else + error_set(error, lex, "control character 0x%x", c); + goto out; + } + + else if(c == '\\') { + c = lex_get_save(lex, error); + if(c == 'u') { + c = lex_get_save(lex, error); + for(i = 0; i < 4; i++) { + if(!isxdigit(c)) { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); + goto out; + } + c = lex_get_save(lex, error); + } + } + else if(c == '"' || c == '\\' || c == '/' || c == 'b' || + c == 'f' || c == 'n' || c == 'r' || c == 't') + c = lex_get_save(lex, error); + else { + lex_unget_unsave(lex, c); + error_set(error, lex, "invalid escape"); + goto out; + } + } + else + c = lex_get_save(lex, error); + } + + /* the actual value is at most of the same length as the source + string, because: + - shortcut escapes (e.g. "\t") (length 2) are converted to 1 byte + - a single \uXXXX escape (length 6) is converted to at most 3 bytes + - two \uXXXX escapes (length 12) forming an UTF-16 surrogate pair + are converted to 4 bytes + */ + lex->value.string = malloc(lex->saved_text.length + 1); + if(!lex->value.string) { + /* this is not very nice, since TOKEN_INVALID is returned */ + goto out; + } + + /* the target */ + t = lex->value.string; + + /* + 1 to skip the " */ + p = strbuffer_value(&lex->saved_text) + 1; + + while(*p != '"') { + if(*p == '\\') { + p++; + if(*p == 'u') { + char buffer[4]; + int length; + int32_t value; + + value = decode_unicode_escape(p); + p += 5; + + if(0xD800 <= value && value <= 0xDBFF) { + /* surrogate pair */ + if(*p == '\\' && *(p + 1) == 'u') { + int32_t value2 = decode_unicode_escape(++p); + p += 5; + + if(0xDC00 <= value2 && value2 <= 0xDFFF) { + /* valid second surrogate */ + value = + ((value - 0xD800) << 10) + + (value2 - 0xDC00) + + 0x10000; + } + else { + /* invalid second surrogate */ + error_set(error, lex, + "invalid Unicode '\\u%04X\\u%04X'", + value, value2); + goto out; + } + } + else { + /* no second surrogate */ + error_set(error, lex, "invalid Unicode '\\u%04X'", + value); + goto out; + } + } + else if(0xDC00 <= value && value <= 0xDFFF) { + error_set(error, lex, "invalid Unicode '\\u%04X'", value); + goto out; + } + else if(value == 0) + { + error_set(error, lex, "\\u0000 is not allowed"); + goto out; + } + + if(utf8_encode(value, buffer, &length)) + assert(0); + + memcpy(t, buffer, length); + t += length; + } + else { + switch(*p) { + case '"': case '\\': case '/': + *t = *p; break; + case 'b': *t = '\b'; break; + case 'f': *t = '\f'; break; + case 'n': *t = '\n'; break; + case 'r': *t = '\r'; break; + case 't': *t = '\t'; break; + default: assert(0); + } + t++; + p++; + } + } + else + *(t++) = *(p++); + } + *t = '\0'; + lex->token = TOKEN_STRING; + return; + +out: + free(lex->value.string); +} + +static int lex_scan_number(lex_t *lex, char c, json_error_t *error) +{ + const char *saved_text; + char *end; + double value; + + lex->token = TOKEN_INVALID; + + if(c == '-') + c = lex_get_save(lex, error); + + if(c == '0') { + c = lex_get_save(lex, error); + if(isdigit(c)) { + lex_unget_unsave(lex, c); + goto out; + } + } + else if(isdigit(c)) { + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + else { + lex_unget_unsave(lex, c); 
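+        /* no digit at all: the character was pushed back above; fail the scan */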
+ goto out; + } + + if(c != '.' && c != 'E' && c != 'e') { + long value; + + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + value = strtol(saved_text, &end, 10); + assert(end == saved_text + lex->saved_text.length); + + if((value == LONG_MAX && errno == ERANGE) || value > INT_MAX) { + error_set(error, lex, "too big integer"); + goto out; + } + else if((value == LONG_MIN && errno == ERANGE) || value < INT_MIN) { + error_set(error, lex, "too big negative integer"); + goto out; + } + + lex->token = TOKEN_INTEGER; + lex->value.integer = (int)value; + return 0; + } + + if(c == '.') { + c = lex_get(lex, error); + if(!isdigit(c)) + goto out; + lex_save(lex, c); + + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + + if(c == 'E' || c == 'e') { + c = lex_get_save(lex, error); + if(c == '+' || c == '-') + c = lex_get_save(lex, error); + + if(!isdigit(c)) { + lex_unget_unsave(lex, c); + goto out; + } + + c = lex_get_save(lex, error); + while(isdigit(c)) + c = lex_get_save(lex, error); + } + + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + value = strtod(saved_text, &end); + assert(end == saved_text + lex->saved_text.length); + + if(errno == ERANGE && value != 0) { + error_set(error, lex, "real number overflow"); + goto out; + } + + lex->token = TOKEN_REAL; + lex->value.real = value; + return 0; + +out: + return -1; +} + +static int lex_scan(lex_t *lex, json_error_t *error) +{ + char c; + + strbuffer_clear(&lex->saved_text); + + if(lex->token == TOKEN_STRING) { + free(lex->value.string); + lex->value.string = NULL; + } + + c = lex_get(lex, error); + while(c == ' ' || c == '\t' || c == '\n' || c == '\r') + { + if(c == '\n') + lex->line++; + + c = lex_get(lex, error); + } + + if(c == (char)EOF) { + if(lex_eof(lex)) + lex->token = TOKEN_EOF; + else + lex->token = TOKEN_INVALID; + goto out; + } + + lex_save(lex, c); + + if(c == '{' || c == '}' || c == '[' || c == ']' || c == ':' || c == ',') + lex->token = c; + + else if(c == '"') + lex_scan_string(lex, error); + + else if(isdigit(c) || c == '-') { + if(lex_scan_number(lex, c, error)) + goto out; + } + + else if(isupper(c) || islower(c)) { + /* eat up the whole identifier for clearer error messages */ + const char *saved_text; + + c = lex_get_save(lex, error); + while(isupper(c) || islower(c)) + c = lex_get_save(lex, error); + lex_unget_unsave(lex, c); + + saved_text = strbuffer_value(&lex->saved_text); + + if(strcmp(saved_text, "true") == 0) + lex->token = TOKEN_TRUE; + else if(strcmp(saved_text, "false") == 0) + lex->token = TOKEN_FALSE; + else if(strcmp(saved_text, "null") == 0) + lex->token = TOKEN_NULL; + else + lex->token = TOKEN_INVALID; + } + + else { + /* save the rest of the input UTF-8 sequence to get an error + message of valid UTF-8 */ + lex_save_cached(lex); + lex->token = TOKEN_INVALID; + } + +out: + return lex->token; +} + +static char *lex_steal_string(lex_t *lex) +{ + char *result = NULL; + if(lex->token == TOKEN_STRING) + { + result = lex->value.string; + lex->value.string = NULL; + } + return result; +} + +static int lex_init(lex_t *lex, get_func get, eof_func eof, void *data) +{ + stream_init(&lex->stream, get, eof, data); + if(strbuffer_init(&lex->saved_text)) + return -1; + + lex->token = TOKEN_INVALID; + lex->line = 1; + + return 0; +} + +static void lex_close(lex_t *lex) +{ + if(lex->token == TOKEN_STRING) + free(lex->value.string); + strbuffer_close(&lex->saved_text); +} + + +/*** parser ***/ + +static json_t *parse_value(lex_t 
*lex, json_error_t *error); + +static json_t *parse_object(lex_t *lex, json_error_t *error) +{ + json_t *object = json_object(); + if(!object) + return NULL; + + lex_scan(lex, error); + if(lex->token == '}') + return object; + + while(1) { + char *key; + json_t *value; + + if(lex->token != TOKEN_STRING) { + error_set(error, lex, "string or '}' expected"); + goto error; + } + + key = lex_steal_string(lex); + if(!key) + return NULL; + + lex_scan(lex, error); + if(lex->token != ':') { + free(key); + error_set(error, lex, "':' expected"); + goto error; + } + + lex_scan(lex, error); + value = parse_value(lex, error); + if(!value) { + free(key); + goto error; + } + + if(json_object_set_nocheck(object, key, value)) { + free(key); + json_decref(value); + goto error; + } + + json_decref(value); + free(key); + + lex_scan(lex, error); + if(lex->token != ',') + break; + + lex_scan(lex, error); + } + + if(lex->token != '}') { + error_set(error, lex, "'}' expected"); + goto error; + } + + return object; + +error: + json_decref(object); + return NULL; +} + +static json_t *parse_array(lex_t *lex, json_error_t *error) +{ + json_t *array = json_array(); + if(!array) + return NULL; + + lex_scan(lex, error); + if(lex->token == ']') + return array; + + while(lex->token) { + json_t *elem = parse_value(lex, error); + if(!elem) + goto error; + + if(json_array_append(array, elem)) { + json_decref(elem); + goto error; + } + json_decref(elem); + + lex_scan(lex, error); + if(lex->token != ',') + break; + + lex_scan(lex, error); + } + + if(lex->token != ']') { + error_set(error, lex, "']' expected"); + goto error; + } + + return array; + +error: + json_decref(array); + return NULL; +} + +static json_t *parse_value(lex_t *lex, json_error_t *error) +{ + json_t *json; + + switch(lex->token) { + case TOKEN_STRING: { + json = json_string_nocheck(lex->value.string); + break; + } + + case TOKEN_INTEGER: { + json = json_integer(lex->value.integer); + break; + } + + case TOKEN_REAL: { + json = json_real(lex->value.real); + break; + } + + case TOKEN_TRUE: + json = json_true(); + break; + + case TOKEN_FALSE: + json = json_false(); + break; + + case TOKEN_NULL: + json = json_null(); + break; + + case '{': + json = parse_object(lex, error); + break; + + case '[': + json = parse_array(lex, error); + break; + + case TOKEN_INVALID: + error_set(error, lex, "invalid token"); + return NULL; + + default: + error_set(error, lex, "unexpected token"); + return NULL; + } + + if(!json) + return NULL; + + return json; +} + +static json_t *parse_json(lex_t *lex, json_error_t *error) +{ + error_init(error); + + lex_scan(lex, error); + if(lex->token != '[' && lex->token != '{') { + error_set(error, lex, "'[' or '{' expected"); + return NULL; + } + + return parse_value(lex, error); +} + +typedef struct +{ + const char *data; + int pos; +} string_data_t; + +static int string_get(void *data) +{ + char c; + string_data_t *stream = (string_data_t *)data; + c = stream->data[stream->pos]; + if(c == '\0') + return EOF; + else + { + stream->pos++; + return c; + } +} + +static int string_eof(void *data) +{ + string_data_t *stream = (string_data_t *)data; + return (stream->data[stream->pos] == '\0'); +} + +json_t *json_loads(const char *string, json_error_t *error) +{ + lex_t lex; + json_t *result; + + string_data_t stream_data = { + .data = string, + .pos = 0 + }; + + if(lex_init(&lex, string_get, string_eof, (void *)&stream_data)) + return NULL; + + result = parse_json(&lex, error); + if(!result) + goto out; + + lex_scan(&lex, error); + if(lex.token != 
TOKEN_EOF) {
+        error_set(error, &lex, "end of file expected");
+        json_decref(result);
+        result = NULL;
+    }
+
+out:
+    lex_close(&lex);
+    return result;
+}
+
+json_t *json_loadf(FILE *input, json_error_t *error)
+{
+    lex_t lex;
+    json_t *result;
+
+    if(lex_init(&lex, (get_func)fgetc, (eof_func)feof, input))
+        return NULL;
+
+    result = parse_json(&lex, error);
+    if(!result)
+        goto out;
+
+    lex_scan(&lex, error);
+    if(lex.token != TOKEN_EOF) {
+        error_set(error, &lex, "end of file expected");
+        json_decref(result);
+        result = NULL;
+    }
+
+out:
+    lex_close(&lex);
+    return result;
+}
+
+json_t *json_load_file(const char *path, json_error_t *error)
+{
+    json_t *result;
+    FILE *fp;
+
+    error_init(error);
+
+    fp = fopen(path, "r");
+    if(!fp)
+    {
+        error_set(error, NULL, "unable to open %s: %s",
+                  path, strerror(errno));
+        return NULL;
+    }
+
+    result = json_loadf(fp, error);
+
+    fclose(fp);
+    return result;
+}
diff --git a/compat/jansson/strbuffer.c b/compat/jansson/strbuffer.c
new file mode 100644
index 00000000..34960247
--- /dev/null
+++ b/compat/jansson/strbuffer.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <string.h>
+#include "strbuffer.h"
+#include "util.h"
+
+#define STRBUFFER_MIN_SIZE  16
+#define STRBUFFER_FACTOR    2
+
+int strbuffer_init(strbuffer_t *strbuff)
+{
+    strbuff->size = STRBUFFER_MIN_SIZE;
+    strbuff->length = 0;
+
+    strbuff->value = malloc(strbuff->size);
+    if(!strbuff->value)
+        return -1;
+
+    /* initialize to empty */
+    strbuff->value[0] = '\0';
+    return 0;
+}
+
+void strbuffer_close(strbuffer_t *strbuff)
+{
+    free(strbuff->value);
+    strbuff->size = 0;
+    strbuff->length = 0;
+    strbuff->value = NULL;
+}
+
+void strbuffer_clear(strbuffer_t *strbuff)
+{
+    strbuff->length = 0;
+    strbuff->value[0] = '\0';
+}
+
+const char *strbuffer_value(const strbuffer_t *strbuff)
+{
+    return strbuff->value;
+}
+
+char *strbuffer_steal_value(strbuffer_t *strbuff)
+{
+    char *result = strbuff->value;
+    strbuffer_init(strbuff);
+    return result;
+}
+
+int strbuffer_append(strbuffer_t *strbuff, const char *string)
+{
+    return strbuffer_append_bytes(strbuff, string, strlen(string));
+}
+
+int strbuffer_append_byte(strbuffer_t *strbuff, char byte)
+{
+    return strbuffer_append_bytes(strbuff, &byte, 1);
+}
+
+int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size)
+{
+    if(strbuff->length + size >= strbuff->size)
+    {
+        strbuff->size = max(strbuff->size * STRBUFFER_FACTOR,
+                            strbuff->length + size + 1);
+
+        strbuff->value = realloc(strbuff->value, strbuff->size);
+        if(!strbuff->value)
+            return -1;
+    }
+
+    memcpy(strbuff->value + strbuff->length, data, size);
+    strbuff->length += size;
+    strbuff->value[strbuff->length] = '\0';
+
+    return 0;
+}
+
+char strbuffer_pop(strbuffer_t *strbuff)
+{
+    if(strbuff->length > 0) {
+        char c = strbuff->value[--strbuff->length];
+        strbuff->value[strbuff->length] = '\0';
+        return c;
+    }
+    else
+        return '\0';
+}
diff --git a/compat/jansson/strbuffer.h b/compat/jansson/strbuffer.h
new file mode 100644
index 00000000..f4c5f771
--- /dev/null
+++ b/compat/jansson/strbuffer.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#ifndef STRBUFFER_H
+#define STRBUFFER_H
+
+typedef struct {
+    char *value;
+    int length;  /* bytes used */
+    int size;    /* bytes allocated */
+} strbuffer_t;
+
+int strbuffer_init(strbuffer_t *strbuff);
+void strbuffer_close(strbuffer_t *strbuff);
+
+void strbuffer_clear(strbuffer_t *strbuff);
+
+const char *strbuffer_value(const strbuffer_t *strbuff);
+char *strbuffer_steal_value(strbuffer_t *strbuff);
+
+int strbuffer_append(strbuffer_t *strbuff, const char *string);
+int strbuffer_append_byte(strbuffer_t *strbuff, char byte);
+int strbuffer_append_bytes(strbuffer_t *strbuff, const char *data, int size);
+
+char strbuffer_pop(strbuffer_t *strbuff);
+
+#endif
diff --git a/compat/jansson/utf.c b/compat/jansson/utf.c
new file mode 100644
index 00000000..92484d02
--- /dev/null
+++ b/compat/jansson/utf.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#include <string.h>
+#include "utf.h"
+
+int utf8_encode(int32_t codepoint, char *buffer, int *size)
+{
+    if(codepoint < 0)
+        return -1;
+    else if(codepoint < 0x80)
+    {
+        buffer[0] = (char)codepoint;
+        *size = 1;
+    }
+    else if(codepoint < 0x800)
+    {
+        buffer[0] = 0xC0 + ((codepoint & 0x7C0) >> 6);
+        buffer[1] = 0x80 + ((codepoint & 0x03F));
+        *size = 2;
+    }
+    else if(codepoint < 0x10000)
+    {
+        buffer[0] = 0xE0 + ((codepoint & 0xF000) >> 12);
+        buffer[1] = 0x80 + ((codepoint & 0x0FC0) >> 6);
+        buffer[2] = 0x80 + ((codepoint & 0x003F));
+        *size = 3;
+    }
+    else if(codepoint <= 0x10FFFF)
+    {
+        buffer[0] = 0xF0 + ((codepoint & 0x1C0000) >> 18);
+        buffer[1] = 0x80 + ((codepoint & 0x03F000) >> 12);
+        buffer[2] = 0x80 + ((codepoint & 0x000FC0) >> 6);
+        buffer[3] = 0x80 + ((codepoint & 0x00003F));
+        *size = 4;
+    }
+    else
+        return -1;
+
+    return 0;
+}
+
+int utf8_check_first(char byte)
+{
+    unsigned char u = (unsigned char)byte;
+
+    if(u < 0x80)
+        return 1;
+
+    if(0x80 <= u && u <= 0xBF) {
+        /* second, third or fourth byte of a multi-byte
+           sequence, i.e.
a "continuation byte" */ + return 0; + } + else if(u == 0xC0 || u == 0xC1) { + /* overlong encoding of an ASCII byte */ + return 0; + } + else if(0xC2 <= u && u <= 0xDF) { + /* 2-byte sequence */ + return 2; + } + + else if(0xE0 <= u && u <= 0xEF) { + /* 3-byte sequence */ + return 3; + } + else if(0xF0 <= u && u <= 0xF4) { + /* 4-byte sequence */ + return 4; + } + else { /* u >= 0xF5 */ + /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid + UTF-8 */ + return 0; + } +} + +int utf8_check_full(const char *buffer, int size, int32_t *codepoint) +{ + int i; + int32_t value = 0; + unsigned char u = (unsigned char)buffer[0]; + + if(size == 2) + { + value = u & 0x1F; + } + else if(size == 3) + { + value = u & 0xF; + } + else if(size == 4) + { + value = u & 0x7; + } + else + return 0; + + for(i = 1; i < size; i++) + { + u = (unsigned char)buffer[i]; + + if(u < 0x80 || u > 0xBF) { + /* not a continuation byte */ + return 0; + } + + value = (value << 6) + (u & 0x3F); + } + + if(value > 0x10FFFF) { + /* not in Unicode range */ + return 0; + } + + else if(0xD800 <= value && value <= 0xDFFF) { + /* invalid code point (UTF-16 surrogate halves) */ + return 0; + } + + else if((size == 2 && value < 0x80) || + (size == 3 && value < 0x800) || + (size == 4 && value < 0x10000)) { + /* overlong encoding */ + return 0; + } + + if(codepoint) + *codepoint = value; + + return 1; +} + +const char *utf8_iterate(const char *buffer, int32_t *codepoint) +{ + int count; + int32_t value; + + if(!*buffer) + return buffer; + + count = utf8_check_first(buffer[0]); + if(count <= 0) + return NULL; + + if(count == 1) + value = (unsigned char)buffer[0]; + else + { + if(!utf8_check_full(buffer, count, &value)) + return NULL; + } + + if(codepoint) + *codepoint = value; + + return buffer + count; +} + +int utf8_check_string(const char *string, int length) +{ + int i; + + if(length == -1) + length = strlen(string); + + for(i = 0; i < length; i++) + { + int count = utf8_check_first(string[i]); + if(count == 0) + return 0; + else if(count > 1) + { + if(i + count > length) + return 0; + + if(!utf8_check_full(&string[i], count, NULL)) + return 0; + + i += count - 1; + } + } + + return 1; +} diff --git a/compat/jansson/utf.h b/compat/jansson/utf.h new file mode 100644 index 00000000..d0ae6e93 --- /dev/null +++ b/compat/jansson/utf.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef UTF_H +#define UTF_H + +#include + +#ifdef HAVE_INTTYPES_H +/* inttypes.h includes stdint.h in a standard environment, so there's +no need to include stdint.h separately. If inttypes.h doesn't define +int32_t, it's defined in config.h. */ +#include +#endif + +int utf8_encode(int codepoint, char *buffer, int *size); + +int utf8_check_first(char byte); +int utf8_check_full(const char *buffer, int size, int32_t *codepoint); +const char *utf8_iterate(const char *buffer, int32_t *codepoint); + +int utf8_check_string(const char *string, int length); + +#endif diff --git a/compat/jansson/util.h b/compat/jansson/util.h new file mode 100644 index 00000000..06a547b8 --- /dev/null +++ b/compat/jansson/util.h @@ -0,0 +1,13 @@ +/* + * Copyright (c) 2009, 2010 Petri Lehtinen + * + * Jansson is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See LICENSE for details. + */ + +#ifndef UTIL_H +#define UTIL_H + +#define max(a, b) ((a) > (b) ? 
+
+#endif
diff --git a/compat/jansson/value.c b/compat/jansson/value.c
new file mode 100644
index 00000000..e024fdb1
--- /dev/null
+++ b/compat/jansson/value.c
@@ -0,0 +1,976 @@
+/*
+ * Copyright (c) 2009, 2010 Petri Lehtinen
+ *
+ * Jansson is free software; you can redistribute it and/or modify
+ * it under the terms of the MIT license. See LICENSE for details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <jansson.h>
+#include "hashtable.h"
+#include "jansson_private.h"
+#include "utf.h"
+#include "util.h"
+
+
+static inline void json_init(json_t *json, json_type type)
+{
+    json->type = type;
+    json->refcount = 1;
+}
+
+
+/*** object ***/
+
+/* This macro just returns a pointer that's a few bytes backwards from
+   string. This makes it possible to pass a pointer to object_key_t
+   when only the string inside it is used, without actually creating
+   an object_key_t instance. */
+#define string_to_key(string)  container_of(string, object_key_t, key)
+
+static unsigned int hash_key(const void *ptr)
+{
+    const char *str = ((const object_key_t *)ptr)->key;
+
+    unsigned int hash = 5381;
+    unsigned int c;
+
+    while((c = (unsigned int)*str))
+    {
+        hash = ((hash << 5) + hash) + c;
+        str++;
+    }
+
+    return hash;
+}
+
+static int key_equal(const void *ptr1, const void *ptr2)
+{
+    return strcmp(((const object_key_t *)ptr1)->key,
+                  ((const object_key_t *)ptr2)->key) == 0;
+}
+
+static void value_decref(void *value)
+{
+    json_decref((json_t *)value);
+}
+
+json_t *json_object(void)
+{
+    json_object_t *object = malloc(sizeof(json_object_t));
+    if(!object)
+        return NULL;
+    json_init(&object->json, JSON_OBJECT);
+
+    if(hashtable_init(&object->hashtable, hash_key, key_equal,
+                      free, value_decref))
+    {
+        free(object);
+        return NULL;
+    }
+
+    object->serial = 0;
+    object->visited = 0;
+
+    return &object->json;
+}
+
+static void json_delete_object(json_object_t *object)
+{
+    hashtable_close(&object->hashtable);
+    free(object);
+}
+
+unsigned int json_object_size(const json_t *json)
+{
+    json_object_t *object;
+
+    if(!json_is_object(json))
+        return -1;
+
+    object = json_to_object(json);
+    return object->hashtable.size;
+}
+
+json_t *json_object_get(const json_t *json, const char *key)
+{
+    json_object_t *object;
+
+    if(!json_is_object(json))
+        return NULL;
+
+    object = json_to_object(json);
+    return hashtable_get(&object->hashtable, string_to_key(key));
+}
+
+int json_object_set_new_nocheck(json_t *json, const char *key, json_t *value)
+{
+    json_object_t *object;
+    object_key_t *k;
+
+    if(!key || !value)
+        return -1;
+
+    if(!json_is_object(json) || json == value)
+    {
+        json_decref(value);
+        return -1;
+    }
+    object = json_to_object(json);
+
+    k = malloc(sizeof(object_key_t) + strlen(key) + 1);
+    if(!k)
+        return -1;
+
+    k->serial = object->serial++;
+    strcpy(k->key, key);
+
+    if(hashtable_set(&object->hashtable, k, value))
+    {
+        json_decref(value);
+        return -1;
+    }
+
+    return 0;
+}
+
+int json_object_set_new(json_t *json, const char *key, json_t *value)
+{
+    if(!key || !utf8_check_string(key, -1))
+    {
+        json_decref(value);
+        return -1;
+    }
+
+    return json_object_set_new_nocheck(json, key, value);
+}
+
+int json_object_del(json_t *json, const char *key)
+{
+    json_object_t *object;
+
+    if(!json_is_object(json))
+        return -1;
+
+    object = json_to_object(json);
+    return hashtable_del(&object->hashtable, string_to_key(key));
+}
+
+int json_object_clear(json_t *json)
+{
+    json_object_t *object;
+
+    if(!json_is_object(json))
+        return -1;
+
+    object = json_to_object(json);
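+    /* every key/value pair is dropped; the free_key/free_value callbacks
+       given to hashtable_init run for each of them */
+    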
hashtable_clear(&object->hashtable); + + return 0; +} + +int json_object_update(json_t *object, json_t *other) +{ + void *iter; + + if(!json_is_object(object) || !json_is_object(other)) + return -1; + + iter = json_object_iter(other); + while(iter) { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + + if(json_object_set_nocheck(object, key, value)) + return -1; + + iter = json_object_iter_next(other, iter); + } + + return 0; +} + +void *json_object_iter(json_t *json) +{ + json_object_t *object; + + if(!json_is_object(json)) + return NULL; + + object = json_to_object(json); + return hashtable_iter(&object->hashtable); +} + +void *json_object_iter_at(json_t *json, const char *key) +{ + json_object_t *object; + + if(!key || !json_is_object(json)) + return NULL; + + object = json_to_object(json); + return hashtable_iter_at(&object->hashtable, string_to_key(key)); +} + +void *json_object_iter_next(json_t *json, void *iter) +{ + json_object_t *object; + + if(!json_is_object(json) || iter == NULL) + return NULL; + + object = json_to_object(json); + return hashtable_iter_next(&object->hashtable, iter); +} + +const object_key_t *jsonp_object_iter_fullkey(void *iter) +{ + if(!iter) + return NULL; + + return hashtable_iter_key(iter); +} + +const char *json_object_iter_key(void *iter) +{ + if(!iter) + return NULL; + + return jsonp_object_iter_fullkey(iter)->key; +} + +json_t *json_object_iter_value(void *iter) +{ + if(!iter) + return NULL; + + return (json_t *)hashtable_iter_value(iter); +} + +int json_object_iter_set_new(json_t *json, void *iter, json_t *value) +{ + json_object_t *object; + + if(!json_is_object(json) || !iter || !value) + return -1; + + object = json_to_object(json); + hashtable_iter_set(&object->hashtable, iter, value); + + return 0; +} + +static int json_object_equal(json_t *object1, json_t *object2) +{ + void *iter; + + if(json_object_size(object1) != json_object_size(object2)) + return 0; + + iter = json_object_iter(object1); + while(iter) + { + const char *key; + json_t *value1, *value2; + + key = json_object_iter_key(iter); + value1 = json_object_iter_value(iter); + value2 = json_object_get(object2, key); + + if(!json_equal(value1, value2)) + return 0; + + iter = json_object_iter_next(object1, iter); + } + + return 1; +} + +static json_t *json_object_copy(json_t *object) +{ + json_t *result; + void *iter; + + result = json_object(); + if(!result) + return NULL; + + iter = json_object_iter(object); + while(iter) + { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + json_object_set_nocheck(result, key, value); + + iter = json_object_iter_next(object, iter); + } + + return result; +} + +static json_t *json_object_deep_copy(json_t *object) +{ + json_t *result; + void *iter; + + result = json_object(); + if(!result) + return NULL; + + iter = json_object_iter(object); + while(iter) + { + const char *key; + json_t *value; + + key = json_object_iter_key(iter); + value = json_object_iter_value(iter); + json_object_set_new_nocheck(result, key, json_deep_copy(value)); + + iter = json_object_iter_next(object, iter); + } + + return result; +} + + +/*** array ***/ + +json_t *json_array(void) +{ + json_array_t *array = malloc(sizeof(json_array_t)); + if(!array) + return NULL; + json_init(&array->json, JSON_ARRAY); + + array->entries = 0; + array->size = 8; + + array->table = malloc(array->size * sizeof(json_t *)); + if(!array->table) { + free(array); + return 
NULL; + } + + array->visited = 0; + + return &array->json; +} + +static void json_delete_array(json_array_t *array) +{ + unsigned int i; + + for(i = 0; i < array->entries; i++) + json_decref(array->table[i]); + + free(array->table); + free(array); +} + +unsigned int json_array_size(const json_t *json) +{ + if(!json_is_array(json)) + return 0; + + return json_to_array(json)->entries; +} + +json_t *json_array_get(const json_t *json, unsigned int index) +{ + json_array_t *array; + if(!json_is_array(json)) + return NULL; + array = json_to_array(json); + + if(index >= array->entries) + return NULL; + + return array->table[index]; +} + +int json_array_set_new(json_t *json, unsigned int index, json_t *value) +{ + json_array_t *array; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) + { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(index >= array->entries) + { + json_decref(value); + return -1; + } + + json_decref(array->table[index]); + array->table[index] = value; + + return 0; +} + +static void array_move(json_array_t *array, unsigned int dest, + unsigned int src, unsigned int count) +{ + memmove(&array->table[dest], &array->table[src], count * sizeof(json_t *)); +} + +static void array_copy(json_t **dest, unsigned int dpos, + json_t **src, unsigned int spos, + unsigned int count) +{ + memcpy(&dest[dpos], &src[spos], count * sizeof(json_t *)); +} + +static json_t **json_array_grow(json_array_t *array, + unsigned int amount, + int copy) +{ + unsigned int new_size; + json_t **old_table, **new_table; + + if(array->entries + amount <= array->size) + return array->table; + + old_table = array->table; + + new_size = max(array->size + amount, array->size * 2); + new_table = malloc(new_size * sizeof(json_t *)); + if(!new_table) + return NULL; + + array->size = new_size; + array->table = new_table; + + if(copy) { + array_copy(array->table, 0, old_table, 0, array->entries); + free(old_table); + return array->table; + } + + return old_table; +} + +int json_array_append_new(json_t *json, json_t *value) +{ + json_array_t *array; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) + { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(!json_array_grow(array, 1, 1)) { + json_decref(value); + return -1; + } + + array->table[array->entries] = value; + array->entries++; + + return 0; +} + +int json_array_insert_new(json_t *json, unsigned int index, json_t *value) +{ + json_array_t *array; + json_t **old_table; + + if(!value) + return -1; + + if(!json_is_array(json) || json == value) { + json_decref(value); + return -1; + } + array = json_to_array(json); + + if(index > array->entries) { + json_decref(value); + return -1; + } + + old_table = json_array_grow(array, 1, 0); + if(!old_table) { + json_decref(value); + return -1; + } + + if(old_table != array->table) { + array_copy(array->table, 0, old_table, 0, index); + array_copy(array->table, index + 1, old_table, index, + array->entries - index); + free(old_table); + } + else + array_move(array, index + 1, index, array->entries - index); + + array->table[index] = value; + array->entries++; + + return 0; +} + +int json_array_remove(json_t *json, unsigned int index) +{ + json_array_t *array; + + if(!json_is_array(json)) + return -1; + array = json_to_array(json); + + if(index >= array->entries) + return -1; + + json_decref(array->table[index]); + + array_move(array, index, index + 1, array->entries - index); + array->entries--; + + return 0; +} + +int 
json_array_clear(json_t *json) +{ + json_array_t *array; + unsigned int i; + + if(!json_is_array(json)) + return -1; + array = json_to_array(json); + + for(i = 0; i < array->entries; i++) + json_decref(array->table[i]); + + array->entries = 0; + return 0; +} + +int json_array_extend(json_t *json, json_t *other_json) +{ + json_array_t *array, *other; + unsigned int i; + + if(!json_is_array(json) || !json_is_array(other_json)) + return -1; + array = json_to_array(json); + other = json_to_array(other_json); + + if(!json_array_grow(array, other->entries, 1)) + return -1; + + for(i = 0; i < other->entries; i++) + json_incref(other->table[i]); + + array_copy(array->table, array->entries, other->table, 0, other->entries); + + array->entries += other->entries; + return 0; +} + +static int json_array_equal(json_t *array1, json_t *array2) +{ + unsigned int i, size; + + size = json_array_size(array1); + if(size != json_array_size(array2)) + return 0; + + for(i = 0; i < size; i++) + { + json_t *value1, *value2; + + value1 = json_array_get(array1, i); + value2 = json_array_get(array2, i); + + if(!json_equal(value1, value2)) + return 0; + } + + return 1; +} + +static json_t *json_array_copy(json_t *array) +{ + json_t *result; + unsigned int i; + + result = json_array(); + if(!result) + return NULL; + + for(i = 0; i < json_array_size(array); i++) + json_array_append(result, json_array_get(array, i)); + + return result; +} + +static json_t *json_array_deep_copy(json_t *array) +{ + json_t *result; + unsigned int i; + + result = json_array(); + if(!result) + return NULL; + + for(i = 0; i < json_array_size(array); i++) + json_array_append_new(result, json_deep_copy(json_array_get(array, i))); + + return result; +} + +/*** string ***/ + +json_t *json_string_nocheck(const char *value) +{ + json_string_t *string; + + if(!value) + return NULL; + + string = malloc(sizeof(json_string_t)); + if(!string) + return NULL; + json_init(&string->json, JSON_STRING); + + string->value = strdup(value); + if(!string->value) { + free(string); + return NULL; + } + + return &string->json; +} + +json_t *json_string(const char *value) +{ + if(!value || !utf8_check_string(value, -1)) + return NULL; + + return json_string_nocheck(value); +} + +const char *json_string_value(const json_t *json) +{ + if(!json_is_string(json)) + return NULL; + + return json_to_string(json)->value; +} + +int json_string_set_nocheck(json_t *json, const char *value) +{ + char *dup; + json_string_t *string; + + dup = strdup(value); + if(!dup) + return -1; + + string = json_to_string(json); + free(string->value); + string->value = dup; + + return 0; +} + +int json_string_set(json_t *json, const char *value) +{ + if(!value || !utf8_check_string(value, -1)) + return -1; + + return json_string_set_nocheck(json, value); +} + +static void json_delete_string(json_string_t *string) +{ + free(string->value); + free(string); +} + +static int json_string_equal(json_t *string1, json_t *string2) +{ + return strcmp(json_string_value(string1), json_string_value(string2)) == 0; +} + +static json_t *json_string_copy(json_t *string) +{ + return json_string_nocheck(json_string_value(string)); +} + + +/*** integer ***/ + +json_t *json_integer(int value) +{ + json_integer_t *integer = malloc(sizeof(json_integer_t)); + if(!integer) + return NULL; + json_init(&integer->json, JSON_INTEGER); + + integer->value = value; + return &integer->json; +} + +int json_integer_value(const json_t *json) +{ + if(!json_is_integer(json)) + return 0; + + return json_to_integer(json)->value; +} + 
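+/* Illustrative sketch, not part of the upstream sources: reading and
+   updating an integer value pairs the two accessors around it, e.g.
+
+       json_t *n = json_integer(41);
+       json_integer_set(n, json_integer_value(n) + 1);
+       json_decref(n);
+*/
+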
+int json_integer_set(json_t *json, int value)
+{
+    if(!json_is_integer(json))
+        return -1;
+
+    json_to_integer(json)->value = value;
+
+    return 0;
+}
+
+static void json_delete_integer(json_integer_t *integer)
+{
+    free(integer);
+}
+
+static int json_integer_equal(json_t *integer1, json_t *integer2)
+{
+    return json_integer_value(integer1) == json_integer_value(integer2);
+}
+
+static json_t *json_integer_copy(json_t *integer)
+{
+    return json_integer(json_integer_value(integer));
+}
+
+
+/*** real ***/
+
+json_t *json_real(double value)
+{
+    json_real_t *real = malloc(sizeof(json_real_t));
+    if(!real)
+        return NULL;
+    json_init(&real->json, JSON_REAL);
+
+    real->value = value;
+    return &real->json;
+}
+
+double json_real_value(const json_t *json)
+{
+    if(!json_is_real(json))
+        return 0;
+
+    return json_to_real(json)->value;
+}
+
+int json_real_set(json_t *json, double value)
+{
+    if(!json_is_real(json))
+        return -1;
+
+    json_to_real(json)->value = value;
+
+    return 0;
+}
+
+static void json_delete_real(json_real_t *real)
+{
+    free(real);
+}
+
+static int json_real_equal(json_t *real1, json_t *real2)
+{
+    return json_real_value(real1) == json_real_value(real2);
+}
+
+static json_t *json_real_copy(json_t *real)
+{
+    return json_real(json_real_value(real));
+}
+
+
+/*** number ***/
+
+double json_number_value(const json_t *json)
+{
+    if(json_is_integer(json))
+        return json_integer_value(json);
+    else if(json_is_real(json))
+        return json_real_value(json);
+    else
+        return 0.0;
+}
+
+
+/*** simple values ***/
+
+json_t *json_true(void)
+{
+    static json_t the_true = {
+        .type = JSON_TRUE,
+        .refcount = (unsigned int)-1
+    };
+    return &the_true;
+}
+
+
+json_t *json_false(void)
+{
+    static json_t the_false = {
+        .type = JSON_FALSE,
+        .refcount = (unsigned int)-1
+    };
+    return &the_false;
+}
+
+
+json_t *json_null(void)
+{
+    static json_t the_null = {
+        .type = JSON_NULL,
+        .refcount = (unsigned int)-1
+    };
+    return &the_null;
+}
+
+
+/*** deletion ***/
+
+void json_delete(json_t *json)
+{
+    if(json_is_object(json))
+        json_delete_object(json_to_object(json));
+
+    else if(json_is_array(json))
+        json_delete_array(json_to_array(json));
+
+    else if(json_is_string(json))
+        json_delete_string(json_to_string(json));
+
+    else if(json_is_integer(json))
+        json_delete_integer(json_to_integer(json));
+
+    else if(json_is_real(json))
+        json_delete_real(json_to_real(json));
+
+    /* json_delete is not called for true, false or null */
+}
+
+
+/*** equality ***/
+
+int json_equal(json_t *json1, json_t *json2)
+{
+    if(!json1 || !json2)
+        return 0;
+
+    if(json_typeof(json1) != json_typeof(json2))
+        return 0;
+
+    /* this covers true, false and null as they are singletons */
+    if(json1 == json2)
+        return 1;
+
+    if(json_is_object(json1))
+        return json_object_equal(json1, json2);
+
+    if(json_is_array(json1))
+        return json_array_equal(json1, json2);
+
+    if(json_is_string(json1))
+        return json_string_equal(json1, json2);
+
+    if(json_is_integer(json1))
+        return json_integer_equal(json1, json2);
+
+    if(json_is_real(json1))
+        return json_real_equal(json1, json2);
+
+    return 0;
+}
+
+
+/*** copying ***/
+
+json_t *json_copy(json_t *json)
+{
+    if(!json)
+        return NULL;
+
+    if(json_is_object(json))
+        return json_object_copy(json);
+
+    if(json_is_array(json))
+        return json_array_copy(json);
+
+    if(json_is_string(json))
+        return json_string_copy(json);
+
+    if(json_is_integer(json))
+        return json_integer_copy(json);
+
+    if(json_is_real(json))
+        return json_real_copy(json);
+
+    if(json_is_true(json) || json_is_false(json) || json_is_null(json))
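+        /* true, false and null are shared singletons, returned as-is */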
+        return json;
+
+    return NULL;
+}
+
+json_t *json_deep_copy(json_t *json)
+{
+    if(!json)
+        return NULL;
+
+    if(json_is_object(json))
+        return json_object_deep_copy(json);
+
+    if(json_is_array(json))
+        return json_array_deep_copy(json);
+
+    /* for the rest of the types, deep copying doesn't differ from
+       shallow copying */
+
+    if(json_is_string(json))
+        return json_string_copy(json);
+
+    if(json_is_integer(json))
+        return json_integer_copy(json);
+
+    if(json_is_real(json))
+        return json_real_copy(json);
+
+    if(json_is_true(json) || json_is_false(json) || json_is_null(json))
+        return json;
+
+    return NULL;
+}
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 00000000..719ec8ee
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,125 @@
+AC_INIT([cpuminer], [2.3.3])
+
+AC_PREREQ([2.59c])
+AC_CANONICAL_SYSTEM
+AC_CONFIG_SRCDIR([cpu-miner.c])
+AM_INIT_AUTOMAKE([gnu])
+AC_CONFIG_HEADERS([cpuminer-config.h])
+
+dnl Make sure anyone changing configure.ac/Makefile.am has a clue
+AM_MAINTAINER_MODE
+
+dnl Checks for programs
+AC_PROG_CC_C99
+AC_PROG_GCC_TRADITIONAL
+AM_PROG_CC_C_O
+AM_PROG_AS
+AC_PROG_RANLIB
+
+dnl Checks for header files
+AC_HEADER_STDC
+AC_CHECK_HEADERS([sys/endian.h sys/param.h syslog.h])
+# sys/sysctl.h requires sys/types.h on FreeBSD
+# sys/sysctl.h requires sys/param.h on OpenBSD
+AC_CHECK_HEADERS([sys/sysctl.h], [], [],
+[#include <sys/types.h>
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+])
+
+AC_CHECK_DECLS([be32dec, le32dec, be32enc, le32enc], [], [],
+[AC_INCLUDES_DEFAULT
+#ifdef HAVE_SYS_ENDIAN_H
+#include <sys/endian.h>
+#endif
+])
+
+AC_FUNC_ALLOCA
+AC_CHECK_FUNCS([getopt_long])
+
+case $target in
+  i*86-*-*)
+    have_x86=true
+    ;;
+  x86_64-*-*|amd64-*-*)
+    have_x86_64=true
+    ;;
+  arm*-*-*)
+    have_arm=true
+    ;;
+esac
+
+PTHREAD_FLAGS="-pthread"
+WS2_LIBS=""
+
+case $target in
+  *-*-mingw*)
+    have_win32=true
+    PTHREAD_FLAGS=""
+    WS2_LIBS="-lws2_32"
+    ;;
+esac
+
+if test x$have_x86 = xtrue -o x$have_x86_64 = xtrue
+then
+  AC_MSG_CHECKING(whether we can compile AVX code)
+  AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vmovdqa %ymm0, %ymm1");])],
+    AC_DEFINE(USE_AVX, 1, [Define to 1 if AVX assembly is available.])
+    AC_MSG_RESULT(yes)
+    AC_MSG_CHECKING(whether we can compile XOP code)
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vprotd \$7, %xmm0, %xmm1");])],
+      AC_DEFINE(USE_XOP, 1, [Define to 1 if XOP assembly is available.])
+      AC_MSG_RESULT(yes)
+    ,
+      AC_MSG_RESULT(no)
+      AC_MSG_WARN([The assembler does not support the XOP instruction set.])
+    )
+    AC_MSG_CHECKING(whether we can compile AVX2 code)
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])],
+      AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.])
+      AC_MSG_RESULT(yes)
+    ,
+      AC_MSG_RESULT(no)
+      AC_MSG_WARN([The assembler does not support the AVX2 instruction set.])
+    )
+  ,
+    AC_MSG_RESULT(no)
+    AC_MSG_WARN([The assembler does not support the AVX instruction set.])
+  )
+fi
+
+AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
+AC_CHECK_LIB([pthread], [pthread_create], PTHREAD_LIBS="-lpthread",
+  AC_CHECK_LIB([pthreadGC2], [pthread_create], PTHREAD_LIBS="-lpthreadGC2",
+    AC_CHECK_LIB([pthreadGC1], [pthread_create], PTHREAD_LIBS="-lpthreadGC1",
+      AC_CHECK_LIB([pthreadGC], [pthread_create], PTHREAD_LIBS="-lpthreadGC"
+))))
+
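+dnl These conditionals are consumed by the Makefile.am files; in particular,
+dnl WANT_JANSSON selects the bundled parser in compat/jansson when no system
+dnl libjansson was found above.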
+AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
+AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
+AM_CONDITIONAL([ARCH_x86], [test x$have_x86 = xtrue])
+AM_CONDITIONAL([ARCH_x86_64], [test x$have_x86_64 = xtrue])
+AM_CONDITIONAL([ARCH_ARM], [test x$have_arm = xtrue])
+
+if test x$request_jansson = xtrue
+then
+	JANSSON_LIBS="compat/jansson/libjansson.a"
+else
+	JANSSON_LIBS=-ljansson
+fi
+
+LIBCURL_CHECK_CONFIG(, 7.15.2, ,
+  [AC_MSG_ERROR([Missing required libcurl >= 7.15.2])])
+
+AC_SUBST(JANSSON_LIBS)
+AC_SUBST(PTHREAD_FLAGS)
+AC_SUBST(PTHREAD_LIBS)
+AC_SUBST(WS2_LIBS)
+
+AC_CONFIG_FILES([
+	Makefile
+	compat/Makefile
+	compat/jansson/Makefile
+	])
+AC_OUTPUT
diff --git a/cpu-miner.c b/cpu-miner.c
new file mode 100644
index 00000000..e383ea11
--- /dev/null
+++ b/cpu-miner.c
@@ -0,0 +1,1516 @@
+/*
+ * Copyright 2010 Jeff Garzik
+ * Copyright 2012-2014 pooler
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */
+
+#include "cpuminer-config.h"
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <inttypes.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <time.h>
+#ifdef WIN32
+#include <windows.h>
+#else
+#include <errno.h>
+#include <signal.h>
+#include <sys/resource.h>
+#if HAVE_SYS_SYSCTL_H
+#include <sys/types.h>
+#if HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+#include <sys/sysctl.h>
+#endif
+#endif
+#include <jansson.h>
+#include <curl/curl.h>
+#include "compat.h"
+#include "miner.h"
+
+#define PROGRAM_NAME		"minerd"
+#define LP_SCANTIME		60
+
+#ifdef __linux /* Linux specific policy and affinity management */
+#include <sched.h>
+static inline void drop_policy(void)
+{
+	struct sched_param param;
+	param.sched_priority = 0;
+
+#ifdef SCHED_IDLE
+	if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
+#endif
+#ifdef SCHED_BATCH
+		sched_setscheduler(0, SCHED_BATCH, &param);
+#endif
+}
+
+static inline void affine_to_cpu(int id, int cpu)
+{
+	cpu_set_t set;
+
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	sched_setaffinity(0, sizeof(set), &set);
+}
+#elif defined(__FreeBSD__) /* FreeBSD specific policy and affinity management */
+#include <sys/cpuset.h>
+static inline void drop_policy(void)
+{
+}
+
+static inline void affine_to_cpu(int id, int cpu)
+{
+	cpuset_t set;
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, sizeof(cpuset_t), &set);
+}
+#else
+static inline void drop_policy(void)
+{
+}
+
+static inline void affine_to_cpu(int id, int cpu)
+{
+}
+#endif
+
+enum workio_commands {
+	WC_GET_WORK,
+	WC_SUBMIT_WORK,
+};
+
+struct workio_cmd {
+	enum workio_commands	cmd;
+	struct thr_info		*thr;
+	union {
+		struct work	*work;
+	} u;
+};
+
+enum sha256_algos {
+	ALGO_SCRYPT,	/* scrypt(1024,1,1) */
+	ALGO_SHA256D,	/* SHA-256d */
+	ALGO_KECCAK,	/* Keccak */
+	ALGO_HEAVY,	/* Heavy */
+	ALGO_QUARK,	/* Quark */
+};
+
+static const char *algo_names[] = {
+	[ALGO_SCRYPT]	= "scrypt",
+	[ALGO_SHA256D]	= "sha256d",
+	[ALGO_KECCAK]	= "keccak",
+	[ALGO_HEAVY]	= "heavy",
+	[ALGO_QUARK]	= "quark"
+};
+
+bool opt_debug = false;
+bool opt_protocol = false;
+static bool opt_benchmark = false;
+bool opt_redirect = true;
+bool want_longpoll = true;
+bool have_longpoll = false;
+bool want_stratum = true;
+bool have_stratum = false;
+static bool submit_old = false;
+bool use_syslog = false;
+static bool opt_background = false;
+static bool opt_quiet = false;
+static int opt_retries = -1;
+static int opt_fail_pause = 30;
+int opt_timeout = 0;
+static int opt_scantime = 5;
+static json_t *opt_config;
+static const bool opt_time = true;
+static enum sha256_algos opt_algo = ALGO_SCRYPT;
+static int opt_n_threads;
+static int num_processors;
+static char *rpc_url;
+static char *rpc_userpass;
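+/* server credentials: rpc_userpass holds "user:pass" as a single string,
+   rpc_user and rpc_pass the two halves (descriptive note) */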
*rpc_userpass; +static char *rpc_user, *rpc_pass; +char *opt_cert; +char *opt_proxy; +long opt_proxy_type; +struct thr_info *thr_info; +static int work_thr_id; +int longpoll_thr_id = -1; +int stratum_thr_id = -1; +struct work_restart *work_restart = NULL; +static struct stratum_ctx stratum; + +pthread_mutex_t applog_lock; +static pthread_mutex_t stats_lock; + +static unsigned long accepted_count = 0L; +static unsigned long rejected_count = 0L; +static double *thr_hashrates; + +#ifdef HAVE_GETOPT_LONG +#include +#else +struct option { + const char *name; + int has_arg; + int *flag; + int val; +}; +#endif + +static char const usage[] = "\ +Usage: " PROGRAM_NAME " [OPTIONS]\n\ +Options:\n\ + -a, --algo=ALGO specify the algorithm to use\n\ + scrypt scrypt(1024, 1, 1) (default)\n\ + sha256d SHA-256d\n\ + keccak Keccak\n\ + heavy Heavy\n\ + -o, --url=URL URL of mining server\n\ + -O, --userpass=U:P username:password pair for mining server\n\ + -u, --user=USERNAME username for mining server\n\ + -p, --pass=PASSWORD password for mining server\n\ + --cert=FILE certificate for mining server using SSL\n\ + -x, --proxy=[PROTOCOL://]HOST[:PORT] connect through a proxy\n\ + -t, --threads=N number of miner threads (default: number of processors)\n\ + -r, --retries=N number of times to retry if a network call fails\n\ + (default: retry indefinitely)\n\ + -R, --retry-pause=N time to pause between retries, in seconds (default: 30)\n\ + -T, --timeout=N timeout for long polling, in seconds (default: none)\n\ + -s, --scantime=N upper bound on time spent scanning current work when\n\ + long polling is unavailable, in seconds (default: 5)\n\ + --no-longpoll disable X-Long-Polling support\n\ + --no-stratum disable X-Stratum support\n\ + --no-redirect ignore requests to change the URL of the mining server\n\ + -q, --quiet disable per-thread hashmeter output\n\ + -D, --debug enable debug output\n\ + -P, --protocol-dump verbose dump of protocol-level activities\n" +#ifdef HAVE_SYSLOG_H +"\ + -S, --syslog use system log for output messages\n" +#endif +#ifndef WIN32 +"\ + -B, --background run the miner in the background\n" +#endif +"\ + --benchmark run in offline benchmark mode\n\ + -c, --config=FILE load a JSON-format configuration file\n\ + -V, --version display version information and exit\n\ + -h, --help display this help text and exit\n\ +"; + +static char const short_options[] = +#ifndef WIN32 + "B" +#endif +#ifdef HAVE_SYSLOG_H + "S" +#endif + "a:c:Dhp:Px:qr:R:s:t:T:o:u:O:V"; + +static struct option const options[] = { + { "algo", 1, NULL, 'a' }, +#ifndef WIN32 + { "background", 0, NULL, 'B' }, +#endif + { "benchmark", 0, NULL, 1005 }, + { "cert", 1, NULL, 1001 }, + { "config", 1, NULL, 'c' }, + { "debug", 0, NULL, 'D' }, + { "help", 0, NULL, 'h' }, + { "no-longpoll", 0, NULL, 1003 }, + { "no-redirect", 0, NULL, 1009 }, + { "no-stratum", 0, NULL, 1007 }, + { "pass", 1, NULL, 'p' }, + { "protocol-dump", 0, NULL, 'P' }, + { "proxy", 1, NULL, 'x' }, + { "quiet", 0, NULL, 'q' }, + { "retries", 1, NULL, 'r' }, + { "retry-pause", 1, NULL, 'R' }, + { "scantime", 1, NULL, 's' }, +#ifdef HAVE_SYSLOG_H + { "syslog", 0, NULL, 'S' }, +#endif + { "threads", 1, NULL, 't' }, + { "timeout", 1, NULL, 'T' }, + { "url", 1, NULL, 'o' }, + { "user", 1, NULL, 'u' }, + { "userpass", 1, NULL, 'O' }, + { "version", 0, NULL, 'V' }, + { 0, 0, 0, 0 } +}; + +struct work { + uint32_t data[32]; + uint32_t target[8]; + + char *job_id; + size_t xnonce2_len; + unsigned char *xnonce2; +}; + +static struct work g_work; +static time_t g_work_time; 
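+/*
+ * Editorial note (not part of the original source): work.data holds the
+ * 80-byte block header as 20 little-endian 32-bit words, followed by the
+ * SHA-256 padding words that stratum_gen_work() below fills in.  The
+ * layout assembled there is:
+ *
+ *   data[0]        version
+ *   data[1..8]     previous block hash
+ *   data[9..16]    merkle root
+ *   data[17]       ntime
+ *   data[18]       nbits (compact target)
+ *   data[19]       nonce, the word the scanhash_* loops iterate over
+ *   data[20..31]   padding (0x80000000 ... 0x00000280)
+ *
+ * A minimal accessor sketch, assuming this layout (hypothetical helper,
+ * not used anywhere in the tree):
+ *
+ *   static inline uint32_t work_nonce(const struct work *w)
+ *   {
+ *       return w->data[19];
+ *   }
+ */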
+static pthread_mutex_t g_work_lock; + +static inline void work_free(struct work *w) +{ + free(w->job_id); + free(w->xnonce2); +} + +static inline void work_copy(struct work *dest, const struct work *src) +{ + memcpy(dest, src, sizeof(struct work)); + if (src->job_id) + dest->job_id = strdup(src->job_id); + if (src->xnonce2) { + dest->xnonce2 = malloc(src->xnonce2_len); + memcpy(dest->xnonce2, src->xnonce2, src->xnonce2_len); + } +} + +static bool jobj_binary(const json_t *obj, const char *key, + void *buf, size_t buflen) +{ + const char *hexstr; + json_t *tmp; + + tmp = json_object_get(obj, key); + if (unlikely(!tmp)) { + applog(LOG_ERR, "JSON key '%s' not found", key); + return false; + } + hexstr = json_string_value(tmp); + if (unlikely(!hexstr)) { + applog(LOG_ERR, "JSON key '%s' is not a string", key); + return false; + } + if (!hex2bin(buf, hexstr, buflen)) + return false; + + return true; +} + +static bool work_decode(const json_t *val, struct work *work) +{ + int i; + + if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data)))) { + applog(LOG_ERR, "JSON inval data"); + goto err_out; + } + if (unlikely(!jobj_binary(val, "target", work->target, sizeof(work->target)))) { + applog(LOG_ERR, "JSON inval target"); + goto err_out; + } + + for (i = 0; i < ARRAY_SIZE(work->data); i++) + work->data[i] = le32dec(work->data + i); + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[i] = le32dec(work->target + i); + + return true; + +err_out: + return false; +} + +static void share_result(int result, const char *reason) +{ + char s[345]; + double hashrate; + int i; + + hashrate = 0.; + pthread_mutex_lock(&stats_lock); + for (i = 0; i < opt_n_threads; i++) + hashrate += thr_hashrates[i]; + result ? accepted_count++ : rejected_count++; + pthread_mutex_unlock(&stats_lock); + + sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "accepted: %lu/%lu (%.2f%%), %s khash/s %s", + accepted_count, + accepted_count + rejected_count, + 100. * accepted_count / (accepted_count + rejected_count), + s, + result ? 
"(yay!!!)" : "(booooo)"); + + if (opt_debug && reason) + applog(LOG_DEBUG, "DEBUG: reject reason: %s", reason); +} + +static bool submit_upstream_work(CURL *curl, struct work *work) +{ + char *str = NULL; + json_t *val, *res, *reason; + char s[345]; + int i; + bool rc = false; + + /* pass if the previous hash is not the current previous hash */ + if (!submit_old && memcmp(work->data + 1, g_work.data + 1, 32)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: stale work detected, discarding"); + return true; + } + + if (have_stratum) { + uint32_t ntime, nonce; + char *ntimestr, *noncestr, *xnonce2str; + + le32enc(&ntime, work->data[17]); + le32enc(&nonce, work->data[19]); + ntimestr = bin2hex((const unsigned char *)(&ntime), 4); + noncestr = bin2hex((const unsigned char *)(&nonce), 4); + xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); + sprintf(s, + "{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}", + rpc_user, work->job_id, xnonce2str, ntimestr, noncestr); + free(ntimestr); + free(noncestr); + free(xnonce2str); + + if (unlikely(!stratum_send_line(&stratum, s))) { + applog(LOG_ERR, "submit_upstream_work stratum_send_line failed"); + goto out; + } + } else { + /* build hex string */ + for (i = 0; i < ARRAY_SIZE(work->data); i++) + le32enc(work->data + i, work->data[i]); + str = bin2hex((unsigned char *)work->data, sizeof(work->data)); + if (unlikely(!str)) { + applog(LOG_ERR, "submit_upstream_work OOM"); + goto out; + } + + /* build JSON-RPC request */ + sprintf(s, + "{\"method\": \"getwork\", \"params\": [ \"%s\" ], \"id\":1}\r\n", + str); + + /* issue JSON-RPC request */ + val = json_rpc_call(curl, rpc_url, rpc_userpass, s, NULL, 0); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + goto out; + } + + res = json_object_get(val, "result"); + reason = json_object_get(val, "reject-reason"); + share_result(json_is_true(res), reason ? 
json_string_value(reason) : NULL); + + json_decref(val); + } + + rc = true; + +out: + free(str); + return rc; +} + +static const char *rpc_req = + "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; + +static bool get_upstream_work(CURL *curl, struct work *work) +{ + json_t *val; + bool rc; + struct timeval tv_start, tv_end, diff; + + gettimeofday(&tv_start, NULL); + val = json_rpc_call(curl, rpc_url, rpc_userpass, rpc_req, NULL, 0); + gettimeofday(&tv_end, NULL); + + if (have_stratum) { + if (val) + json_decref(val); + return true; + } + + if (!val) + return false; + + rc = work_decode(json_object_get(val, "result"), work); + + if (opt_debug && rc) { + timeval_subtract(&diff, &tv_end, &tv_start); + applog(LOG_DEBUG, "DEBUG: got new work in %d ms", + diff.tv_sec * 1000 + diff.tv_usec / 1000); + } + + json_decref(val); + + return rc; +} + +static void workio_cmd_free(struct workio_cmd *wc) +{ + if (!wc) + return; + + switch (wc->cmd) { + case WC_SUBMIT_WORK: + work_free(wc->u.work); + free(wc->u.work); + break; + default: /* do nothing */ + break; + } + + memset(wc, 0, sizeof(*wc)); /* poison */ + free(wc); +} + +static bool workio_get_work(struct workio_cmd *wc, CURL *curl) +{ + struct work *ret_work; + int failures = 0; + + ret_work = calloc(1, sizeof(*ret_work)); + if (!ret_work) + return false; + + /* obtain new work from bitcoin via JSON-RPC */ + while (!get_upstream_work(curl, ret_work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "json_rpc_call failed, terminating workio thread"); + free(ret_work); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "json_rpc_call failed, retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + /* send work to requesting thread */ + if (!tq_push(wc->thr->q, ret_work)) + free(ret_work); + + return true; +} + +static bool workio_submit_work(struct workio_cmd *wc, CURL *curl) +{ + int failures = 0; + + /* submit solution to bitcoin via JSON-RPC */ + while (!submit_upstream_work(curl, wc->u.work)) { + if (unlikely((opt_retries >= 0) && (++failures > opt_retries))) { + applog(LOG_ERR, "...terminating workio thread"); + return false; + } + + /* pause, then restart work-request loop */ + applog(LOG_ERR, "...retry after %d seconds", + opt_fail_pause); + sleep(opt_fail_pause); + } + + return true; +} + +static void *workio_thread(void *userdata) +{ + struct thr_info *mythr = userdata; + CURL *curl; + bool ok = true; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + return NULL; + } + + while (ok) { + struct workio_cmd *wc; + + /* wait for workio_cmd sent to us, on our queue */ + wc = tq_pop(mythr->q, NULL); + if (!wc) { + ok = false; + break; + } + + /* process workio_cmd */ + switch (wc->cmd) { + case WC_GET_WORK: + ok = workio_get_work(wc, curl); + break; + case WC_SUBMIT_WORK: + ok = workio_submit_work(wc, curl); + break; + + default: /* should never happen */ + ok = false; + break; + } + + workio_cmd_free(wc); + } + + tq_freeze(mythr->q); + curl_easy_cleanup(curl); + + return NULL; +} + +static bool get_work(struct thr_info *thr, struct work *work) +{ + struct workio_cmd *wc; + struct work *work_heap; + + if (opt_benchmark) { + memset(work->data, 0x55, 76); + work->data[17] = swab32(time(NULL)); + memset(work->data + 19, 0x00, 52); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + memset(work->target, 0x00, sizeof(work->target)); + return true; + } + + /* fill out work request message 
*/ + wc = calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->cmd = WC_GET_WORK; + wc->thr = thr; + + /* send work request to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) { + workio_cmd_free(wc); + return false; + } + + /* wait for response, a unit of work */ + work_heap = tq_pop(thr->q, NULL); + if (!work_heap) + return false; + + /* copy returned work into storage provided by caller */ + memcpy(work, work_heap, sizeof(*work)); + free(work_heap); + + return true; +} + +static bool submit_work(struct thr_info *thr, const struct work *work_in) +{ + struct workio_cmd *wc; + + /* fill out work request message */ + wc = calloc(1, sizeof(*wc)); + if (!wc) + return false; + + wc->u.work = malloc(sizeof(*work_in)); + if (!wc->u.work) + goto err_out; + + wc->cmd = WC_SUBMIT_WORK; + wc->thr = thr; + work_copy(wc->u.work, work_in); + + /* send solution to workio thread */ + if (!tq_push(thr_info[work_thr_id].q, wc)) + goto err_out; + + return true; + +err_out: + workio_cmd_free(wc); + return false; +} + +static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work) +{ + unsigned char merkle_root[64]; + int i; + + pthread_mutex_lock(&sctx->work_lock); + + free(work->job_id); + work->job_id = strdup(sctx->job.job_id); + work->xnonce2_len = sctx->xnonce2_size; + work->xnonce2 = realloc(work->xnonce2, sctx->xnonce2_size); + memcpy(work->xnonce2, sctx->job.xnonce2, sctx->xnonce2_size); + + /* Generate merkle root */ + sha256d(merkle_root, sctx->job.coinbase, sctx->job.coinbase_size); + for (i = 0; i < sctx->job.merkle_count; i++) { + memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + sha256d(merkle_root, merkle_root, 64); + } + + /* Increment extranonce2 */ + for (i = 0; i < sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); + + /* Assemble block header */ + memset(work->data, 0, 128); + work->data[0] = le32dec(sctx->job.version); + for (i = 0; i < 8; i++) + work->data[1 + i] = le32dec((uint32_t *)sctx->job.prevhash + i); + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug) { + char *xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len); + applog(LOG_DEBUG, "DEBUG: job_id='%s' extranonce2=%s ntime=%08x", + work->job_id, xnonce2str, swab32(work->data[17])); + free(xnonce2str); + } + + if (opt_algo == ALGO_SCRYPT) + diff_to_target(work->target, sctx->job.diff / 65536.0); + else + diff_to_target(work->target, sctx->job.diff); +} + +static void *miner_thread(void *userdata) +{ + struct thr_info *mythr = userdata; + int thr_id = mythr->id; + struct work work = {{0}}; + uint32_t max_nonce; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; + unsigned char *scratchbuf = NULL; + char s[16]; + int i; + + /* Set worker threads to nice 19 and then preferentially to SCHED_IDLE + * and if that fails, then SCHED_BATCH. 
No need for this to be an + * error if it fails */ + if (!opt_benchmark) { + setpriority(PRIO_PROCESS, 0, 19); + drop_policy(); + } + + /* Cpu affinity only makes sense if the number of threads is a multiple + * of the number of CPUs */ + if (num_processors > 1 && opt_n_threads % num_processors == 0) { + if (!opt_quiet) + applog(LOG_INFO, "Binding thread %d to cpu %d", + thr_id, thr_id % num_processors); + affine_to_cpu(thr_id, thr_id % num_processors); + } + + if (opt_algo == ALGO_SCRYPT) + { + scratchbuf = scrypt_buffer_alloc(); + } + + while (1) { + unsigned long hashes_done; + struct timeval tv_start, tv_end, diff; + int64_t max64; + int rc; + + if (have_stratum) { + while (time(NULL) >= g_work_time + 120) + sleep(1); + pthread_mutex_lock(&g_work_lock); + if (work.data[19] >= end_nonce && !memcmp(work.data, g_work.data, 76)) + stratum_gen_work(&stratum, &g_work); + } else { + /* obtain new work from internal workio thread */ + pthread_mutex_lock(&g_work_lock); + if (!have_stratum && (!have_longpoll || + time(NULL) >= g_work_time + LP_SCANTIME*3/4 || + work.data[19] >= end_nonce)) { + if (unlikely(!get_work(mythr, &g_work))) { + applog(LOG_ERR, "work retrieval failed, exiting " + "mining thread %d", mythr->id); + pthread_mutex_unlock(&g_work_lock); + goto out; + } + g_work_time = have_stratum ? 0 : time(NULL); + } + if (have_stratum) { + pthread_mutex_unlock(&g_work_lock); + continue; + } + } + if (memcmp(work.data, g_work.data, 76)) { + work_free(&work); + work_copy(&work, &g_work); + work.data[19] = 0xffffffffU / opt_n_threads * thr_id; + } else + work.data[19]++; + pthread_mutex_unlock(&g_work_lock); + work_restart[thr_id].restart = 0; + + /* adjust max_nonce to meet target scan time */ + if (have_stratum) + max64 = LP_SCANTIME; + else + max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime) + - time(NULL); + max64 *= thr_hashrates[thr_id]; + if (max64 <= 0) + max64 = opt_algo == ALGO_SCRYPT ? 0xfffLL : 0x1fffffLL; + if (work.data[19] + max64 > end_nonce) + max_nonce = end_nonce; + else + max_nonce = work.data[19] + max64; + + hashes_done = 0; + gettimeofday(&tv_start, NULL); + + /* scan nonces for a proof-of-work hash */ + switch (opt_algo) { + case ALGO_SCRYPT: + rc = scanhash_scrypt(thr_id, work.data, scratchbuf, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_SHA256D: + rc = scanhash_sha256d(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_KECCAK: + rc = scanhash_keccak(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_HEAVY: + rc = scanhash_heavy(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + case ALGO_QUARK: + rc = scanhash_quark(thr_id, work.data, work.target, + max_nonce, &hashes_done); + break; + + default: + /* should never happen */ + goto out; + } + + /* record scanhash elapsed time */ + gettimeofday(&tv_end, NULL); + timeval_subtract(&diff, &tv_end, &tv_start); + if (diff.tv_usec || diff.tv_sec) { + pthread_mutex_lock(&stats_lock); + thr_hashrates[thr_id] = + hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec); + pthread_mutex_unlock(&stats_lock); + } + if (!opt_quiet) { + sprintf(s, thr_hashrates[thr_id] >= 1e6 ? 
"%.0f" : "%.2f", + 1e-3 * thr_hashrates[thr_id]); + applog(LOG_INFO, "thread %d: %lu hashes, %s khash/s", + thr_id, hashes_done, s); + } + if (opt_benchmark && thr_id == opt_n_threads - 1) { + double hashrate = 0.; + for (i = 0; i < opt_n_threads && thr_hashrates[i]; i++) + hashrate += thr_hashrates[i]; + if (i == opt_n_threads) { + sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", 1e-3 * hashrate); + applog(LOG_INFO, "Total: %s khash/s", s); + } + } + + /* if nonce found, submit work */ + if (rc && !opt_benchmark && !submit_work(mythr, &work)) + break; + } + +out: + tq_freeze(mythr->q); + + return NULL; +} + +static void restart_threads(void) +{ + int i; + + for (i = 0; i < opt_n_threads; i++) + work_restart[i].restart = 1; +} + +static void *longpoll_thread(void *userdata) +{ + struct thr_info *mythr = userdata; + CURL *curl = NULL; + char *copy_start, *hdr_path = NULL, *lp_url = NULL; + bool need_slash = false; + + curl = curl_easy_init(); + if (unlikely(!curl)) { + applog(LOG_ERR, "CURL initialization failed"); + goto out; + } + +start: + hdr_path = tq_pop(mythr->q, NULL); + if (!hdr_path) + goto out; + + /* full URL */ + if (strstr(hdr_path, "://")) { + lp_url = hdr_path; + hdr_path = NULL; + } + + /* absolute path, on current server */ + else { + copy_start = (*hdr_path == '/') ? (hdr_path + 1) : hdr_path; + if (rpc_url[strlen(rpc_url) - 1] != '/') + need_slash = true; + + lp_url = malloc(strlen(rpc_url) + strlen(copy_start) + 2); + if (!lp_url) + goto out; + + sprintf(lp_url, "%s%s%s", rpc_url, need_slash ? "/" : "", copy_start); + } + + applog(LOG_INFO, "Long-polling activated for %s", lp_url); + + while (1) { + json_t *val, *soval; + int err; + + val = json_rpc_call(curl, lp_url, rpc_userpass, rpc_req, &err, + JSON_RPC_LONGPOLL); + if (have_stratum) { + if (val) + json_decref(val); + goto out; + } + if (likely(val)) { + applog(LOG_INFO, "LONGPOLL detected new block"); + soval = json_object_get(json_object_get(val, "result"), "submitold"); + submit_old = soval ? json_is_true(soval) : false; + pthread_mutex_lock(&g_work_lock); + if (work_decode(json_object_get(val, "result"), &g_work)) { + if (opt_debug) + applog(LOG_DEBUG, "DEBUG: got new work"); + time(&g_work_time); + restart_threads(); + } + pthread_mutex_unlock(&g_work_lock); + json_decref(val); + } else { + pthread_mutex_lock(&g_work_lock); + g_work_time -= LP_SCANTIME; + pthread_mutex_unlock(&g_work_lock); + if (err == CURLE_OPERATION_TIMEDOUT) { + restart_threads(); + } else { + have_longpoll = false; + restart_threads(); + free(hdr_path); + free(lp_url); + lp_url = NULL; + sleep(opt_fail_pause); + goto start; + } + } + } + +out: + free(hdr_path); + free(lp_url); + tq_freeze(mythr->q); + if (curl) + curl_easy_cleanup(curl); + + return NULL; +} + +static bool stratum_handle_response(char *buf) +{ + json_t *val, *err_val, *res_val, *id_val; + json_error_t err; + bool ret = false; + + val = JSON_LOADS(buf, &err); + if (!val) { + applog(LOG_INFO, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + id_val = json_object_get(val, "id"); + + if (!id_val || json_is_null(id_val) || !res_val) + goto out; + + share_result(json_is_true(res_val), + err_val ? 
json_string_value(json_array_get(err_val, 1)) : NULL); + + ret = true; +out: + if (val) + json_decref(val); + + return ret; +} + +static void *stratum_thread(void *userdata) +{ + struct thr_info *mythr = userdata; + char *s; + + stratum.url = tq_pop(mythr->q, NULL); + if (!stratum.url) + goto out; + applog(LOG_INFO, "Starting Stratum on %s", stratum.url); + + while (1) { + int failures = 0; + + while (!stratum.curl) { + pthread_mutex_lock(&g_work_lock); + g_work_time = 0; + pthread_mutex_unlock(&g_work_lock); + restart_threads(); + + if (!stratum_connect(&stratum, stratum.url) || + !stratum_subscribe(&stratum) || + !stratum_authorize(&stratum, rpc_user, rpc_pass)) { + stratum_disconnect(&stratum); + if (opt_retries >= 0 && ++failures > opt_retries) { + applog(LOG_ERR, "...terminating workio thread"); + tq_push(thr_info[work_thr_id].q, NULL); + goto out; + } + applog(LOG_ERR, "...retry after %d seconds", opt_fail_pause); + sleep(opt_fail_pause); + } + } + + if (stratum.job.job_id && + (!g_work_time || strcmp(stratum.job.job_id, g_work.job_id))) { + pthread_mutex_lock(&g_work_lock); + stratum_gen_work(&stratum, &g_work); + time(&g_work_time); + pthread_mutex_unlock(&g_work_lock); + if (stratum.job.clean) { + applog(LOG_INFO, "Stratum detected new block"); + restart_threads(); + } + } + + if (!stratum_socket_full(&stratum, 120)) { + applog(LOG_ERR, "Stratum connection timed out"); + s = NULL; + } else + s = stratum_recv_line(&stratum); + if (!s) { + stratum_disconnect(&stratum); + applog(LOG_ERR, "Stratum connection interrupted"); + continue; + } + if (!stratum_handle_method(&stratum, s)) + stratum_handle_response(s); + free(s); + } + +out: + return NULL; +} + +static void show_version_and_exit(void) +{ + printf(PACKAGE_STRING "\n built on " __DATE__ "\n features:" +#if defined(__i386__) + " i386" +#endif +#if defined(__x86_64__) + " x86_64" +#endif +#if defined(__i386__) || defined(__x86_64__) + " SSE2" +#endif +#if defined(__x86_64__) && defined(USE_AVX) + " AVX" +#endif +#if defined(__x86_64__) && defined(USE_AVX2) + " AVX2" +#endif +#if defined(__x86_64__) && defined(USE_XOP) + " XOP" +#endif +#if defined(__arm__) && defined(__APCS_32__) + " ARM" +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) + " ARMv5E" +#endif +#if defined(__ARM_NEON__) + " NEON" +#endif +#endif + "\n"); + + printf("%s\n", curl_version()); +#ifdef JANSSON_VERSION + printf("libjansson %s\n", JANSSON_VERSION); +#endif + exit(0); +} + +static void show_usage_and_exit(int status) +{ + if (status) + fprintf(stderr, "Try `" PROGRAM_NAME " --help' for more information.\n"); + else + printf(usage); + exit(status); +} + +static void parse_arg (int key, char *arg) +{ + char *p; + int v, i; + + switch(key) { + case 'a': + for (i = 0; i < ARRAY_SIZE(algo_names); i++) { + if (algo_names[i] && + !strcmp(arg, algo_names[i])) { + opt_algo = i; + break; + } + } + if (i == ARRAY_SIZE(algo_names)) + show_usage_and_exit(1); + break; + case 'B': + opt_background = true; + break; + case 'c': { + json_error_t err; + if (opt_config) + json_decref(opt_config); +#if JANSSON_VERSION_HEX >= 0x020000 + opt_config = json_load_file(arg, 0, &err); 
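+		/* Editorial note: Jansson 2.0 added a flags argument to
+		 * json_load_file(), which is why both call forms are kept
+		 * here behind the JANSSON_VERSION_HEX check. */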
+#else + opt_config = json_load_file(arg, &err); +#endif + if (!json_is_object(opt_config)) { + applog(LOG_ERR, "JSON decode of %s failed", arg); + exit(1); + } + break; + } + case 'q': + opt_quiet = true; + break; + case 'D': + opt_debug = true; + break; + case 'p': + free(rpc_pass); + rpc_pass = strdup(arg); + break; + case 'P': + opt_protocol = true; + break; + case 'r': + v = atoi(arg); + if (v < -1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_retries = v; + break; + case 'R': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_fail_pause = v; + break; + case 's': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_scantime = v; + break; + case 'T': + v = atoi(arg); + if (v < 1 || v > 99999) /* sanity check */ + show_usage_and_exit(1); + opt_timeout = v; + break; + case 't': + v = atoi(arg); + if (v < 1 || v > 9999) /* sanity check */ + show_usage_and_exit(1); + opt_n_threads = v; + break; + case 'u': + free(rpc_user); + rpc_user = strdup(arg); + break; + case 'o': /* --url */ + p = strstr(arg, "://"); + if (p) { + if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && + strncasecmp(arg, "stratum+tcp://", 14)) + show_usage_and_exit(1); + free(rpc_url); + rpc_url = strdup(arg); + } else { + if (!strlen(arg) || *arg == '/') + show_usage_and_exit(1); + free(rpc_url); + rpc_url = malloc(strlen(arg) + 8); + sprintf(rpc_url, "http://%s", arg); + } + p = strrchr(rpc_url, '@'); + if (p) { + char *sp, *ap; + *p = '\0'; + ap = strstr(rpc_url, "://") + 3; + sp = strchr(ap, ':'); + if (sp) { + free(rpc_userpass); + rpc_userpass = strdup(ap); + free(rpc_user); + rpc_user = calloc(sp - ap + 1, 1); + strncpy(rpc_user, ap, sp - ap); + free(rpc_pass); + rpc_pass = strdup(sp + 1); + } else { + free(rpc_user); + rpc_user = strdup(ap); + } + memmove(ap, p + 1, strlen(p + 1) + 1); + } + have_stratum = !opt_benchmark && !strncasecmp(rpc_url, "stratum", 7); + break; + case 'O': /* --userpass */ + p = strchr(arg, ':'); + if (!p) + show_usage_and_exit(1); + free(rpc_userpass); + rpc_userpass = strdup(arg); + free(rpc_user); + rpc_user = calloc(p - arg + 1, 1); + strncpy(rpc_user, arg, p - arg); + free(rpc_pass); + rpc_pass = strdup(p + 1); + break; + case 'x': /* --proxy */ + if (!strncasecmp(arg, "socks4://", 9)) + opt_proxy_type = CURLPROXY_SOCKS4; + else if (!strncasecmp(arg, "socks5://", 9)) + opt_proxy_type = CURLPROXY_SOCKS5; +#if LIBCURL_VERSION_NUM >= 0x071200 + else if (!strncasecmp(arg, "socks4a://", 10)) + opt_proxy_type = CURLPROXY_SOCKS4A; + else if (!strncasecmp(arg, "socks5h://", 10)) + opt_proxy_type = CURLPROXY_SOCKS5_HOSTNAME; +#endif + else + opt_proxy_type = CURLPROXY_HTTP; + free(opt_proxy); + opt_proxy = strdup(arg); + break; + case 1001: + free(opt_cert); + opt_cert = strdup(arg); + break; + case 1005: + opt_benchmark = true; + want_longpoll = false; + want_stratum = false; + have_stratum = false; + break; + case 1003: + want_longpoll = false; + break; + case 1007: + want_stratum = false; + break; + case 1009: + opt_redirect = false; + break; + case 'S': + use_syslog = true; + break; + case 'V': + show_version_and_exit(); + case 'h': + show_usage_and_exit(0); + default: + show_usage_and_exit(1); + } +} + +static void parse_config(void) +{ + int i; + json_t *val; + + if (!json_is_object(opt_config)) + return; + + for (i = 0; i < ARRAY_SIZE(options); i++) { + if (!options[i].name) + break; + if (!strcmp(options[i].name, "config")) + continue; + + val = 
json_object_get(opt_config, options[i].name); + if (!val) + continue; + + if (options[i].has_arg && json_is_string(val)) { + char *s = strdup(json_string_value(val)); + if (!s) + break; + parse_arg(options[i].val, s); + free(s); + } else if (!options[i].has_arg && json_is_true(val)) + parse_arg(options[i].val, ""); + else + applog(LOG_ERR, "JSON option %s invalid", + options[i].name); + } +} + +static void parse_cmdline(int argc, char *argv[]) +{ + int key; + + while (1) { +#if HAVE_GETOPT_LONG + key = getopt_long(argc, argv, short_options, options, NULL); +#else + key = getopt(argc, argv, short_options); +#endif + if (key < 0) + break; + + parse_arg(key, optarg); + } + if (optind < argc) { + fprintf(stderr, "%s: unsupported non-option argument '%s'\n", + argv[0], argv[optind]); + show_usage_and_exit(1); + } + + parse_config(); +} + +#ifndef WIN32 +static void signal_handler(int sig) +{ + switch (sig) { + case SIGHUP: + applog(LOG_INFO, "SIGHUP received"); + break; + case SIGINT: + applog(LOG_INFO, "SIGINT received, exiting"); + exit(0); + break; + case SIGTERM: + applog(LOG_INFO, "SIGTERM received, exiting"); + exit(0); + break; + } +} +#endif + +int main(int argc, char *argv[]) +{ + struct thr_info *thr; + long flags; + int i; + + rpc_user = strdup(""); + rpc_pass = strdup(""); + + /* parse command line */ + parse_cmdline(argc, argv); + + if (!opt_benchmark && !rpc_url) { + fprintf(stderr, "%s: no URL supplied\n", argv[0]); + show_usage_and_exit(1); + } + + if (!rpc_userpass) { + rpc_userpass = malloc(strlen(rpc_user) + strlen(rpc_pass) + 2); + if (!rpc_userpass) + return 1; + sprintf(rpc_userpass, "%s:%s", rpc_user, rpc_pass); + } + + pthread_mutex_init(&applog_lock, NULL); + pthread_mutex_init(&stats_lock, NULL); + pthread_mutex_init(&g_work_lock, NULL); + pthread_mutex_init(&stratum.sock_lock, NULL); + pthread_mutex_init(&stratum.work_lock, NULL); + + flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) + ? 
(CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) + : CURL_GLOBAL_ALL; + if (curl_global_init(flags)) { + applog(LOG_ERR, "CURL initialization failed"); + return 1; + } + +#ifndef WIN32 + if (opt_background) { + i = fork(); + if (i < 0) exit(1); + if (i > 0) exit(0); + i = setsid(); + if (i < 0) + applog(LOG_ERR, "setsid() failed (errno = %d)", errno); + i = chdir("/"); + if (i < 0) + applog(LOG_ERR, "chdir() failed (errno = %d)", errno); + signal(SIGHUP, signal_handler); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); + } +#endif + +#if defined(WIN32) + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + num_processors = sysinfo.dwNumberOfProcessors; +#elif defined(_SC_NPROCESSORS_CONF) + num_processors = sysconf(_SC_NPROCESSORS_CONF); +#elif defined(CTL_HW) && defined(HW_NCPU) + int req[] = { CTL_HW, HW_NCPU }; + size_t len = sizeof(num_processors); + sysctl(req, 2, &num_processors, &len, NULL, 0); +#else + num_processors = 1; +#endif + if (num_processors < 1) + num_processors = 1; + if (!opt_n_threads) + opt_n_threads = num_processors; + +#ifdef HAVE_SYSLOG_H + if (use_syslog) + openlog("cpuminer", LOG_PID, LOG_USER); +#endif + + work_restart = calloc(opt_n_threads, sizeof(*work_restart)); + if (!work_restart) + return 1; + + thr_info = calloc(opt_n_threads + 3, sizeof(*thr)); + if (!thr_info) + return 1; + + thr_hashrates = (double *) calloc(opt_n_threads, sizeof(double)); + if (!thr_hashrates) + return 1; + + /* init workio thread info */ + work_thr_id = opt_n_threads; + thr = &thr_info[work_thr_id]; + thr->id = work_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start work I/O thread */ + if (pthread_create(&thr->pth, NULL, workio_thread, thr)) { + applog(LOG_ERR, "workio thread create failed"); + return 1; + } + + if (want_longpoll && !have_stratum) { + /* init longpoll thread info */ + longpoll_thr_id = opt_n_threads + 1; + thr = &thr_info[longpoll_thr_id]; + thr->id = longpoll_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start longpoll thread */ + if (unlikely(pthread_create(&thr->pth, NULL, longpoll_thread, thr))) { + applog(LOG_ERR, "longpoll thread create failed"); + return 1; + } + } + if (want_stratum) { + /* init stratum thread info */ + stratum_thr_id = opt_n_threads + 2; + thr = &thr_info[stratum_thr_id]; + thr->id = stratum_thr_id; + thr->q = tq_new(); + if (!thr->q) + return 1; + + /* start stratum thread */ + if (unlikely(pthread_create(&thr->pth, NULL, stratum_thread, thr))) { + applog(LOG_ERR, "stratum thread create failed"); + return 1; + } + + if (have_stratum) + tq_push(thr_info[stratum_thr_id].q, strdup(rpc_url)); + } + + /* start mining threads */ + for (i = 0; i < opt_n_threads; i++) { + thr = &thr_info[i]; + + thr->id = i; + thr->q = tq_new(); + if (!thr->q) + return 1; + + if (unlikely(pthread_create(&thr->pth, NULL, miner_thread, thr))) { + applog(LOG_ERR, "thread %d create failed", i); + return 1; + } + } + + applog(LOG_INFO, "%d miner threads started, " + "using '%s' algorithm.", + opt_n_threads, + algo_names[opt_algo]); + + /* main loop - simply wait for workio thread to exit */ + pthread_join(thr_info[work_thr_id].pth, NULL); + + applog(LOG_INFO, "workio thread dead, exiting."); + + return 0; +} diff --git a/elist.h b/elist.h new file mode 100644 index 00000000..b2e8263d --- /dev/null +++ b/elist.h @@ -0,0 +1,251 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * Simple doubly linked list implementation. 
+ * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define INIT_LIST_HEAD(ptr) do { \ + (ptr)->next = (ptr); (ptr)->prev = (ptr); \ +} while (0) + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head *prev, struct list_head *next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + */ +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = (void *) 0; + entry->prev = (void *) 0; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. 
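+ *
+ * Illustrative usage (editorial example; struct item is a hypothetical
+ * type, not one from this tree):
+ *
+ *	struct item { int val; struct list_head node; };
+ *	LIST_HEAD(queue);
+ *	...
+ *	if (!list_empty(&queue)) {
+ *		struct item *it = list_entry(queue.next, struct item, node);
+ *		...
+ *	}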
+ */ +static inline int list_empty(struct list_head *head) +{ + return head->next == head; +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); \ + pos = pos->next) +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop counter. + * @head: the head for your list. + */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); \ + pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop counter. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop counter. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_continue - iterate over list of given type + * continuing after existing point + * @pos: the type * to use as a loop counter. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. 
+ */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member), \ + prefetch(pos->member.next)) + +#endif diff --git a/example-cfg.json b/example-cfg.json new file mode 100644 index 00000000..228a66d5 --- /dev/null +++ b/example-cfg.json @@ -0,0 +1,13 @@ +{ + "_comment1" : "Any long-format command line argument ", + "_comment2" : "may be used in this JSON configuration file", + + "url" : "http://127.0.0.1:9332/", + "user" : "rpcuser", + "pass" : "rpcpass", + + "algo" : "scrypt", + "threads" : "4", + + "quiet" : true +} diff --git a/heavy.c b/heavy.c new file mode 100644 index 00000000..45654cf2 --- /dev/null +++ b/heavy.c @@ -0,0 +1,103 @@ +#include +#include +#include + +#include "miner.h" +#include "sha3/sph_hefty1.h" +#include "sha3/sph_keccak.h" +#include "sha3/sph_blake.h" +#include "sha3/sph_groestl.h" + +/* Combines top 64-bits from each hash into a single hash */ +static void combine_hashes(uint32_t *out, uint32_t *hash1, uint32_t *hash2, uint32_t *hash3, uint32_t *hash4) +{ + uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; + + /* Transpose first 64 bits of each hash into out */ + memset(out, 0, 32); + int bits = 0; + for (unsigned int i = 7; i >= 6; i--) { + for (uint32_t mask = 0x80000000; mask; mask >>= 1) { + for (unsigned int k = 0; k < 4; k++) { + out[(255 - bits)/32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits)/32] |= 1; + bits++; + } + } + } +} + + + +void heavycoin_hash(unsigned char* output, const unsigned char* input, int len) +{ + unsigned char hash1[32]; + HEFTY1(input, len, hash1); + + /* HEFTY1 is new, so take an extra security measure to eliminate + * the possiblity of collisions: + * + * Hash(x) = SHA256(x + HEFTY1(x)) + * + * N.B. '+' is concatenation. + */ + unsigned char hash2[32];; + SHA256_CTX ctx; + SHA256_Init(&ctx); + SHA256_Update(&ctx, input, len); + SHA256_Update(&ctx, hash1, sizeof(hash1)); + SHA256_Final(hash2, &ctx); + + /* Additional security: Do not rely on a single cryptographic hash + * function. Instead, combine the outputs of 4 of the most secure + * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 + * and BLAKE512. 
+ */ + + uint32_t hash3[16]; + sph_keccak512_context keccakCtx; + sph_keccak512_init(&keccakCtx); + sph_keccak512(&keccakCtx, input, len); + sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); + sph_keccak512_close(&keccakCtx, (void *)&hash3); + + uint32_t hash4[16]; + sph_groestl512_context groestlCtx; + sph_groestl512_init(&groestlCtx); + sph_groestl512(&groestlCtx, input, len); + sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); + sph_groestl512_close(&groestlCtx, (void *)&hash4); + + uint32_t hash5[16]; + sph_blake512_context blakeCtx; + sph_blake512_init(&blakeCtx); + sph_blake512(&blakeCtx, input, len); + sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); + sph_blake512_close(&blakeCtx, (void *)&hash5); + + uint32_t *final = (uint32_t *)output; + combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); +} + +int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t hash[8]; + uint32_t start_nonce = pdata[19]; + + do { + heavycoin_hash((unsigned char *)hash, (unsigned char *)pdata, 80); + + if (hash[7] <= ptarget[7]) { + if (fulltest(hash, ptarget)) { + *hashes_done = pdata[19] - start_nonce; + return 1; + break; + } + } + pdata[19]++; + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + *hashes_done = pdata[19] - start_nonce; + return 0; +} \ No newline at end of file diff --git a/keccak.c b/keccak.c new file mode 100644 index 00000000..82b759f6 --- /dev/null +++ b/keccak.c @@ -0,0 +1,52 @@ +#include "cpuminer-config.h" +#include "miner.h" + +#include +#include + +#include "sha3/sph_keccak.h" + +static void keccakhash(void *state, const void *input) +{ + sph_keccak256_context ctx_keccak; + uint32_t hash[32]; + + sph_keccak256_init(&ctx_keccak); + sph_keccak256 (&ctx_keccak,input, 80); + sph_keccak256_close(&ctx_keccak, hash); + + memcpy(state, hash, 32); +} + +int scanhash_keccak(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + + uint32_t hash64[8] __attribute__((aligned(32))); + uint32_t endiandata[32]; + + int kk=0; + for (; kk < 32; kk++) + { + be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]); + }; + + do { + + pdata[19] = ++n; + be32enc(&endiandata[19], n); + keccakhash(hash64, &endiandata); + if (((hash64[7]&0xFFFFFF00)==0) && + fulltest(hash64, ptarget)) { + *hashes_done = n - first_nonce + 1; + return true; + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} \ No newline at end of file diff --git a/miner.h b/miner.h new file mode 100644 index 00000000..5979aece --- /dev/null +++ b/miner.h @@ -0,0 +1,264 @@ +#ifndef __MINER_H__ +#define __MINER_H__ + +#include "cpuminer-config.h" + +#include +#include +#include +#include +#include +#include + +#ifdef STDC_HEADERS +# include +# include +#else +# ifdef HAVE_STDLIB_H +# include +# endif +#endif +#ifdef HAVE_ALLOCA_H +# include +#elif !defined alloca +# ifdef __GNUC__ +# define alloca __builtin_alloca +# elif defined _AIX +# define alloca __alloca +# elif defined _MSC_VER +# include +# define alloca _alloca +# elif !defined HAVE_ALLOCA +# ifdef __cplusplus +extern "C" +# endif +void *alloca (size_t); +# endif +#endif + +#ifdef HAVE_SYSLOG_H +#include +#else +enum { + LOG_ERR, + LOG_WARNING, + LOG_NOTICE, + LOG_INFO, + LOG_DEBUG, +}; +#endif + +#undef unlikely +#undef 
likely +#if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__) +#define unlikely(expr) (__builtin_expect(!!(expr), 0)) +#define likely(expr) (__builtin_expect(!!(expr), 1)) +#else +#define unlikely(expr) (expr) +#define likely(expr) (expr) +#endif + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#endif + +#if ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +#define WANT_BUILTIN_BSWAP +#else +#define bswap_32(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) \ + | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + +static inline uint32_t swab32(uint32_t v) +{ +#ifdef WANT_BUILTIN_BSWAP + return __builtin_bswap32(v); +#else + return bswap_32(v); +#endif +} + +#ifdef HAVE_SYS_ENDIAN_H +#include +#endif + +#if !HAVE_DECL_BE32DEC +static inline uint32_t be32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[3]) + ((uint32_t)(p[2]) << 8) + + ((uint32_t)(p[1]) << 16) + ((uint32_t)(p[0]) << 24)); +} +#endif + +#if !HAVE_DECL_LE32DEC +static inline uint32_t le32dec(const void *pp) +{ + const uint8_t *p = (uint8_t const *)pp; + return ((uint32_t)(p[0]) + ((uint32_t)(p[1]) << 8) + + ((uint32_t)(p[2]) << 16) + ((uint32_t)(p[3]) << 24)); +} +#endif + +#if !HAVE_DECL_BE32ENC +static inline void be32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[3] = x & 0xff; + p[2] = (x >> 8) & 0xff; + p[1] = (x >> 16) & 0xff; + p[0] = (x >> 24) & 0xff; +} +#endif + +#if !HAVE_DECL_LE32ENC +static inline void le32enc(void *pp, uint32_t x) +{ + uint8_t *p = (uint8_t *)pp; + p[0] = x & 0xff; + p[1] = (x >> 8) & 0xff; + p[2] = (x >> 16) & 0xff; + p[3] = (x >> 24) & 0xff; +} +#endif + +#if JANSSON_MAJOR_VERSION >= 2 +#define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr)) +#else +#define JSON_LOADS(str, err_ptr) json_loads((str), (err_ptr)) +#endif + +#define USER_AGENT PACKAGE_NAME "/" PACKAGE_VERSION + +void sha256_init(uint32_t *state); +void sha256_transform(uint32_t *state, const uint32_t *block, int swap); +void sha256d(unsigned char *hash, const unsigned char *data, int len); + +#if defined(__ARM_NEON__) || defined(__i386__) || defined(__x86_64__) +#define HAVE_SHA256_4WAY 1 +int sha256_use_4way(); +void sha256_init_4way(uint32_t *state); +void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); +#endif + +#if defined(__x86_64__) && defined(USE_AVX2) +#define HAVE_SHA256_8WAY 1 +int sha256_use_8way(); +void sha256_init_8way(uint32_t *state); +void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); +#endif + +extern int scanhash_sha256d(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern unsigned char *scrypt_buffer_alloc(); +extern int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_keccak(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_heavy(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +extern int scanhash_quark(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done); + +struct thr_info { + int id; + pthread_t pth; + struct thread_q *q; +}; + +struct work_restart { + volatile unsigned long restart; + char padding[128 - sizeof(unsigned long)]; +}; + +extern bool 
opt_debug; +extern bool opt_protocol; +extern bool opt_redirect; +extern int opt_timeout; +extern bool want_longpoll; +extern bool have_longpoll; +extern bool want_stratum; +extern bool have_stratum; +extern char *opt_cert; +extern char *opt_proxy; +extern long opt_proxy_type; +extern bool use_syslog; +extern pthread_mutex_t applog_lock; +extern struct thr_info *thr_info; +extern int longpoll_thr_id; +extern int stratum_thr_id; +extern struct work_restart *work_restart; + +#define JSON_RPC_LONGPOLL (1 << 0) +#define JSON_RPC_QUIET_404 (1 << 1) + +extern void applog(int prio, const char *fmt, ...); +extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass, + const char *rpc_req, int *curl_err, int flags); +extern char *bin2hex(const unsigned char *p, size_t len); +extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len); +extern int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y); +extern bool fulltest(const uint32_t *hash, const uint32_t *target); +extern void diff_to_target(uint32_t *target, double diff); + +struct stratum_job { + char *job_id; + unsigned char prevhash[32]; + size_t coinbase_size; + unsigned char *coinbase; + unsigned char *xnonce2; + int merkle_count; + unsigned char **merkle; + unsigned char version[4]; + unsigned char nbits[4]; + unsigned char ntime[4]; + bool clean; + double diff; +}; + +struct stratum_ctx { + char *url; + + CURL *curl; + char *curl_url; + char curl_err_str[CURL_ERROR_SIZE]; + curl_socket_t sock; + size_t sockbuf_size; + char *sockbuf; + pthread_mutex_t sock_lock; + + double next_diff; + + char *session_id; + size_t xnonce1_size; + unsigned char *xnonce1; + size_t xnonce2_size; + struct stratum_job job; + pthread_mutex_t work_lock; +}; + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout); +bool stratum_send_line(struct stratum_ctx *sctx, char *s); +char *stratum_recv_line(struct stratum_ctx *sctx); +bool stratum_connect(struct stratum_ctx *sctx, const char *url); +void stratum_disconnect(struct stratum_ctx *sctx); +bool stratum_subscribe(struct stratum_ctx *sctx); +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass); +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); + +struct thread_q; + +extern struct thread_q *tq_new(void); +extern void tq_free(struct thread_q *tq); +extern bool tq_push(struct thread_q *tq, void *data); +extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime); +extern void tq_freeze(struct thread_q *tq); +extern void tq_thaw(struct thread_q *tq); + +#endif /* __MINER_H__ */ diff --git a/minerd.1 b/minerd.1 new file mode 100644 index 00000000..f5561a03 --- /dev/null +++ b/minerd.1 @@ -0,0 +1,198 @@ +.TH MINERD 1 "February 2014" "cpuminer 2.3.3" +.SH NAME +minerd \- CPU miner for Bitcoin and Litecoin +.SH SYNOPSIS +.B minerd +[\fIOPTION\fR]... +.SH DESCRIPTION +.B minerd +is a multi-threaded CPU miner for Bitcoin, Litecoin and other cryptocurrencies. +It supports the getwork mining protocol as well as the Stratum mining protocol. +.PP +In its normal mode of operation, \fBminerd\fR connects to a mining server +(specified with the \fB\-o\fR option), receives work from it and starts hashing. +As soon as a solution is found, it is submitted to the same mining server, +which can accept or reject it. +When using the getwork protocol, \fBminerd\fR can take advantage +of the long polling extension, if the server supports it; +in any case, fresh work is fetched as needed. 
+When using the Stratum protocol this is not possible, +and the server is responsible for sending fresh work at least every minute; +if it fails to do so, +\fBminerd\fR may drop the connection and try reconnecting again. +.PP +By default, \fBminerd\fR writes all its messages to standard error. +On systems that have a syslog, the \fB\-\-syslog\fR option can be used +to write to it instead. +.PP +On start, the nice value of all miner threads is set to 19. +On Linux, the scheduling policy is also changed to SCHED_IDLE, +or to SCHED_BATCH if that fails. +On multiprocessor systems, \fBminerd\fR +automatically sets the CPU affinity of miner threads +if the number of threads is a multiple of the number of processors. +.SH EXAMPLES +To connect to a Litecoin mining pool that provides a Stratum server +at example.com on port 3333, authenticating as worker "foo" with password "bar": +.PP +.nf +.RS +minerd \-o stratum+tcp://example.com:3333 \-O foo:bar +.RE +.fi +.PP +To mine to a local Bitcoin testnet instance running on port 18332, +authenticating with username "rpcuser" and password "rpcpass": +.PP +.nf +.RS +minerd \-a sha256d \-o http://localhost:18332 \-O rpcuser:rpcpass +.RE +.fi +.PP +To connect to a Litecoin P2Pool node running on my.server on port 9327, +mining in the background and having output sent to the syslog facility, +omitting the per-thread hashmeter output: +.PP +.nf +.RS +minerd \-BSq \-o http://my.server:9327 +.RE +.fi +.SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-algo\fR=\fIALGORITHM\fR +Set the hashing algorithm to use. +Default is scrypt. +Possible values are: +.RS 11 +.TP 10 +.B scrypt +scrypt(1024, 1, 1) (used by Litecoin) +.TP +.B sha256d +SHA-256d (used by Bitcoin) +.RE +.TP +\fB\-\-benchmark\fR +Run in offline benchmark mode. +.TP +\fB\-B\fR, \fB\-\-background\fR +Run in the background as a daemon. +.TP +\fB\-\-cert\fR=\fIFILE\fR +Set an SSL certificate to use with the mining server. +Only supported when using the HTTPS protocol. +.TP +\fB\-c\fR, \fB\-\-config\fR=\fIFILE\fR +Load options from a configuration file. +\fIFILE\fR must contain a JSON object +mapping long options to their arguments (as strings), +or to \fBtrue\fR if no argument is required. +Sample configuration file: + +.nf + { + "url": "stratum+tcp://example.com:3333", + "userpass": "foo:bar", + "retry-pause": "10", + "quiet": true + } +.fi +.TP +\fB\-D\fR, \fB\-\-debug\fR +Enable debug output. +.TP +\fB\-h\fR, \fB\-\-help\fR +Print a help message and exit. +.TP +\fB\-\-no\-longpoll\fR +Do not use long polling. +.TP +\fB\-\-no\-redirect\fR +Ignore requests from the server to switch to a different URL. +.TP +\fB\-\-no\-stratum\fR +Do not switch to Stratum, even if the server advertises support for it. +.TP +\fB\-o\fR, \fB\-\-url\fR=[\fISCHEME\fR://][\fIUSERNAME\fR[:\fIPASSWORD\fR]@]\fIHOST\fR:\fIPORT\fR[/\fIPATH\fR] +Set the URL of the mining server to connect to. +Supported schemes are \fBhttp\fR, \fBhttps\fR and \fBstratum+tcp\fR. +If no scheme is specified, http is assumed. +Specifying a \fIPATH\fR is only supported for HTTP and HTTPS. +Specifying credentials has the same effect as using the \fB\-O\fR option. +.TP +\fB\-O\fR, \fB\-\-userpass\fR=\fIUSERNAME\fR:\fIPASSWORD\fR +Set the credentials to use for connecting to the mining server. +Any value previously set with \fB\-u\fR or \fB\-p\fR is discarded. +.TP +\fB\-p\fR, \fB\-\-pass\fR=\fIPASSWORD\fR +Set the password to use for connecting to the mining server. +Any password previously set with \fB\-O\fR is discarded. 
+.TP
+\fB\-P\fR, \fB\-\-protocol\-dump\fR
+Enable output of all protocol-level activities.
+.TP
+\fB\-q\fR, \fB\-\-quiet\fR
+Disable per-thread hashmeter output.
+.TP
+\fB\-r\fR, \fB\-\-retries\fR=\fIN\fR
+Set the maximum number of times to retry if a network call fails.
+If not specified, the miner will retry indefinitely.
+.TP
+\fB\-R\fR, \fB\-\-retry\-pause\fR=\fISECONDS\fR
+Set how long to wait between retries. Default is 30 seconds.
+.TP
+\fB\-s\fR, \fB\-\-scantime\fR=\fISECONDS\fR
+Set an upper bound on the time the miner can go without fetching fresh work.
+This setting has no effect in Stratum mode or when long polling is activated.
+Default is 5 seconds.
+.TP
+\fB\-S\fR, \fB\-\-syslog\fR
+Log to the syslog facility instead of standard error.
+.TP
+\fB\-t\fR, \fB\-\-threads\fR=\fIN\fR
+Set the number of miner threads.
+If not specified, the miner will try to detect the number of available
+processors and use that.
+.TP
+\fB\-T\fR, \fB\-\-timeout\fR=\fISECONDS\fR
+Set a timeout for long polling.
+.TP
+\fB\-u\fR, \fB\-\-user\fR=\fIUSERNAME\fR
+Set the username to use for connecting to the mining server.
+Any username previously set with \fB\-O\fR is discarded.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Display version information and quit.
+.TP
+\fB\-x\fR, \fB\-\-proxy\fR=[\fISCHEME\fR://][\fIUSERNAME\fR:\fIPASSWORD\fR@]\fIHOST\fR:\fIPORT\fR
+Connect to the mining server through a proxy.
+Supported schemes are: \fBhttp\fR, \fBsocks4\fR, \fBsocks5\fR.
+Since libcurl 7.18.0, the following are also supported:
+\fBsocks4a\fR, \fBsocks5h\fR (SOCKS5 with remote name resolving).
+If no scheme is specified, the proxy is treated as an HTTP proxy.
+.SH ENVIRONMENT
+The following environment variables can be specified in lower case or
+upper case; the lower-case version has precedence. \fBhttp_proxy\fR is an
+exception, as it is only available in lower case.
+.PP
+.RS
+.TP
+\fBhttp_proxy\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
+Sets the proxy server to use for HTTP.
+.TP
+\fBHTTPS_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
+Sets the proxy server to use for HTTPS.
+.TP
+\fBALL_PROXY\fR [\fISCHEME\fR://]\fIHOST\fR:\fIPORT\fR
+Sets the proxy server to use if no protocol-specific proxy is set.
+.RE
+.PP
+Using an environment variable to set the proxy has the same effect as
+using the \fB\-x\fR option.
+.SH AUTHOR
+Most of the code in the current version of minerd was written by
+Pooler <pooler@litecoinpool.org> with contributions from others.
+
+The original minerd was written by Jeff Garzik <jgarzik@pobox.com>.
diff --git a/nomacro.pl b/nomacro.pl
new file mode 100644
index 00000000..e91cda34
--- /dev/null
+++ b/nomacro.pl
@@ -0,0 +1,47 @@
+#!/usr/bin/perl
+# Copyright 2012 pooler@litecoinpool.org
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option)
+# any later version.  See COPYING for more details.
+#
+# nomacro.pl - convert assembler macros to C preprocessor macros.
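+#
+# For example (illustrative macro, not taken from this tree), an input of
+#
+#	.macro SWAP a, b
+#	xchgl \a, \b
+#	.endm
+#
+# is rewritten to
+#
+#	#define SWAP(a, b) \
+#	xchgl a, b; \
+#
+# and later invocations such as "SWAP %eax, %ebx" become "SWAP(%eax, %ebx)".
+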
+
+use strict;
+
+foreach my $f (<*.S>) {
+	rename $f, "$f.orig";
+	open FIN, "$f.orig";
+	open FOUT, ">$f";
+	my $inmacro = 0;
+	my %macros = ();
+	while (<FIN>) {
+		if (m/^\.macro\s+([_0-9A-Z]+)(?:\s*)(.*)$/i) {
+			print FOUT "#define $1($2) \\\n";
+			$macros{$1} = 1;
+			$inmacro = 1;
+			next;
+		}
+		if (m/^\.endm/) {
+			print FOUT "\n";
+			$inmacro = 0;
+			next;
+		}
+		for my $m (keys %macros) {
+			s/^([ \t]*)($m)(?:[ \t]+([^#\n]*))?([;\n])/\1\2(\3)\4/;
+		}
+		if ($inmacro) {
+			if (m/^\s*#if/) {
+				$_ = <FIN> while (!m/^\s*#endif/);
+				next;
+			}
+			next if (m/^\s*$/);
+			s/\\//g;
+			s/$/; \\/;
+		}
+		print FOUT;
+	}
+	close FOUT;
+	close FIN;
+}
diff --git a/quark.c b/quark.c
new file mode 100644
index 00000000..d81487e5
--- /dev/null
+++ b/quark.c
@@ -0,0 +1,252 @@
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include <string.h>
+#include <stdint.h>
+
+#include "sha3/sph_blake.h"
+#include "sha3/sph_bmw.h"
+#include "sha3/sph_groestl.h"
+#include "sha3/sph_jh.h"
+#include "sha3/sph_keccak.h"
+#include "sha3/sph_skein.h"
+
+
+/* Move the init calls out of the hashing loop: initialize all contexts once,
+   externally, then restore them with one single memcpy of the bigger memory block */
+typedef struct {
+	sph_blake512_context	blake1, blake2;
+	sph_bmw512_context	bmw1, bmw2;
+	sph_groestl512_context	groestl1, groestl2;
+	sph_skein512_context	skein1, skein2;
+	sph_jh512_context	jh1, jh2;
+	sph_keccak512_context	keccak1, keccak2;
+} quarkhash_context_holder;
+
+quarkhash_context_holder base_contexts;
+
+void init_quarkhash_contexts()
+{
+	sph_blake512_init(&base_contexts.blake1);
+	sph_bmw512_init(&base_contexts.bmw1);
+	sph_groestl512_init(&base_contexts.groestl1);
+	sph_skein512_init(&base_contexts.skein1);
+	sph_groestl512_init(&base_contexts.groestl2);
+	sph_jh512_init(&base_contexts.jh1);
+	sph_blake512_init(&base_contexts.blake2);
+	sph_bmw512_init(&base_contexts.bmw2);
+	sph_keccak512_init(&base_contexts.keccak1);
+	sph_skein512_init(&base_contexts.skein2);
+	sph_keccak512_init(&base_contexts.keccak2);
+	sph_jh512_init(&base_contexts.jh2);
+}
+
+static void quarkhash(void *state, const void *input)
+{
+//	sph_blake512_context	ctx_blake;
+//	sph_bmw512_context	ctx_bmw;
+//	sph_groestl512_context	ctx_groestl;
+//	sph_jh512_context	ctx_jh;
+//	sph_keccak512_context	ctx_keccak;
+//	sph_skein512_context	ctx_skein;
+//	static unsigned char	pblank[1];
+
+	quarkhash_context_holder ctx;
+
+	uint32_t mask = 8;
+	uint32_t zero = 0;
+
+	//these uint512 in the C++ source of the client are backed by an array of uint32
+	uint32_t hashA[16], hashB[16];
+
+
+	//do one memcpy to get fresh contexts; it's faster, even with the larger block, than issuing a separate memcpy per context
+	memcpy(&ctx, &base_contexts, sizeof(base_contexts));
+
+
+//	sph_blake512_init(&ctx.blake1);
+	sph_blake512 (&ctx.blake1, input, 80);
+	sph_blake512_close (&ctx.blake1, hashA);	//0
+
+//	sph_bmw512_init(&ctx.bmw1);
+	sph_bmw512 (&ctx.bmw1, hashA, 64);	//0
+	sph_bmw512_close(&ctx.bmw1, hashB);	//1
+
+	if ((hashB[0] & mask) != zero)	//1
+	{
+//		sph_groestl512_init(&ctx.groestl1);
+		sph_groestl512 (&ctx.groestl1, hashB, 64);	//1
+		sph_groestl512_close(&ctx.groestl1, hashA);	//2
+	}
+	else
+	{
+//		sph_skein512_init(&ctx.skein1);
+		sph_skein512 (&ctx.skein1, hashB, 64);	//1
+		sph_skein512_close(&ctx.skein1, hashA);	//2
+	}
+
+//	sph_groestl512_init(&ctx.groestl2);
+	sph_groestl512 (&ctx.groestl2, hashA, 64);	//2
+	sph_groestl512_close(&ctx.groestl2, hashB);	//3
+
+//	sph_jh512_init(&ctx.jh1);
+	sph_jh512 (&ctx.jh1, hashB, 64);	//3
+	sph_jh512_close(&ctx.jh1, hashA);	//4
+
+	if ((hashA[0] & mask) != zero)	//4
+	{
+//		sph_blake512_init(&ctx.blake2);
+		sph_blake512 (&ctx.blake2, hashA, 64);	//4
+		sph_blake512_close(&ctx.blake2, hashB);	//5
+	}
+	else
+	{
+//		sph_bmw512_init(&ctx.bmw2);
+		sph_bmw512 (&ctx.bmw2, hashA, 64);	//4
+		sph_bmw512_close(&ctx.bmw2, hashB);	//5
+	}
+
+//	sph_keccak512_init(&ctx.keccak1);
+	sph_keccak512 (&ctx.keccak1, hashB, 64);	//5
+	sph_keccak512_close(&ctx.keccak1, hashA);	//6
+
+//	sph_skein512_init(&ctx.skein2);
+	sph_skein512 (&ctx.skein2, hashA, 64);	//6
+	sph_skein512_close(&ctx.skein2, hashB);	//7
+
+	if ((hashB[0] & mask) != zero)	//7
+	{
+//		sph_keccak512_init(&ctx.keccak2);
+		sph_keccak512 (&ctx.keccak2, hashB, 64);	//7
+		sph_keccak512_close(&ctx.keccak2, hashA);	//8
+	}
+	else
+	{
+//		sph_jh512_init(&ctx.jh2);
+		sph_jh512 (&ctx.jh2, hashB, 64);	//7
+		sph_jh512_close(&ctx.jh2, hashA);	//8
+	}
+
+	memcpy(state, hashA, 32);
+
+/*
+	int ii;
+	printf("result: ");
+	for (ii=0; ii < 32; ii++)
+	{
+		printf ("%.2x", ((uint8_t*)state)[ii]);
+	};
+	printf ("\n");
+*/
+}
+
+int scanhash_quark(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t n = pdata[19] - 1;
+	const uint32_t first_nonce = pdata[19];
+	const uint32_t Htarg = ptarget[7];
+
+	uint32_t hash64[8] __attribute__((aligned(32)));
+	uint32_t endiandata[32];
+
+	//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
+
+	//we need big-endian data...
+	//lesson learned: do NOT byte-swap pdata in place, or minerd will consider all proofs of work stale....
+	int kk=0;
+	for (; kk < 32; kk++)
+	{
+		be32enc(&endiandata[kk], ((uint32_t*)pdata)[kk]);
+	};
+
+//	if (opt_debug)
+//	{
+//		applog(LOG_DEBUG, "Thr: %02d, firstN: %08x, maxN: %08x, ToDo: %d", thr_id, first_nonce, max_nonce, max_nonce-first_nonce);
+//	}
+
+	/* I'm too lazy to put the loop in an inline function... so: dirty copy'n'paste.... */
+	/* I know I could put the mask in a variable instead, but I don't know how the compiler would optimize that, and the CPU might then have to load the value into a register every time */
+	if (ptarget[7]==0) {
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (((hash64[7]&0xFFFFFFFF)==0) &&
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+	else if (ptarget[7]<=0xF)
+	{
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (((hash64[7]&0xFFFFFFF0)==0) &&
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+	else if (ptarget[7]<=0xFF)
+	{
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (((hash64[7]&0xFFFFFF00)==0) &&
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+	else if (ptarget[7]<=0xFFF)
+	{
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (((hash64[7]&0xFFFFF000)==0) &&
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+	else if (ptarget[7]<=0xFFFF)
+	{
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (((hash64[7]&0xFFFF0000)==0) &&
+					fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+	else
+	{
+		do {
+			pdata[19] = ++n;
+			be32enc(&endiandata[19], n);
+			quarkhash(hash64, &endiandata);
+			if (fulltest(hash64, ptarget)) {
+				*hashes_done = n - first_nonce + 1;
+				return true;
+			}
+		} while (n < max_nonce && !work_restart[thr_id].restart);
+	}
+
+
+	*hashes_done = n - first_nonce + 1;
+	pdata[19] = n;
+	return 0;
+}
\ No newline at end of file
diff --git a/scrypt-arm.S b/scrypt-arm.S
new file mode 100644
index 00000000..5e2e29ca
--- /dev/null
+++ b/scrypt-arm.S
@@ -0,0 +1,1173 @@
+/*
+ * Copyright 2012 pooler@litecoinpool.org
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */ + +#include "cpuminer-config.h" + +#if defined(__arm__) && defined(__APCS_32__) + +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \ + defined(__ARM_ARCH_5TEJ__) || defined(__ARM_ARCH_6__) || \ + defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || \ + defined(__ARM_ARCH_6M__) || defined(__ARM_ARCH_6T2__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) +#define __ARM_ARCH_5E_OR_6__ +#endif + +#if defined(__ARM_ARCH_5E_OR_6__) || defined(__ARM_ARCH_7__) || \ + defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || \ + defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) +#define __ARM_ARCH_5E_OR_6_OR_7__ +#endif + +#ifdef __ARM_ARCH_5E_OR_6__ + +.macro scrypt_shuffle + add lr, r0, #9*4 + ldmia r0, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #5*4] + str r5, [r0, #15*4] + str r6, [r0, #12*4] + str r7, [r0, #1*4] + ldr r5, [r0, #7*4] + str r2, [r0, #13*4] + str r8, [r0, #2*4] + strd r4, [r0, #10*4] + str r9, [r0, #7*4] + str r10, [r0, #4*4] + str r11, [r0, #9*4] + str lr, [r0, #3*4] + + add r2, r0, #64+0*4 + add lr, r0, #64+9*4 + ldmia r2, {r2-r7} + ldmia lr, {r2, r8-r12, lr} + str r3, [r0, #64+5*4] + str r5, [r0, #64+15*4] + str r6, [r0, #64+12*4] + str r7, [r0, #64+1*4] + ldr r5, [r0, #64+7*4] + str r2, [r0, #64+13*4] + str r8, [r0, #64+2*4] + strd r4, [r0, #64+10*4] + str r9, [r0, #64+7*4] + str r10, [r0, #64+4*4] + str r11, [r0, #64+9*4] + str lr, [r0, #64+3*4] +.endm + +.macro salsa8_core_doubleround_body + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #25 + add r6, r0, r4 + eor r11, r11, r7, ror #25 + add r7, r1, r5 + strd r10, [sp, #14*4] + eor r12, r12, r6, ror #25 + eor lr, lr, r7, ror #25 + + ldrd r6, [sp, #10*4] + add r2, r10, r2 + add r3, r11, r3 + eor r6, r6, r2, ror #23 + add r2, r12, r0 + eor r7, r7, r3, ror #23 + add r3, lr, r1 + strd r6, [sp, #10*4] + eor r8, r8, r2, ror #23 + eor r9, r9, r3, ror #23 + + ldrd r2, [sp, #6*4] + add r10, r6, r10 + add r11, r7, r11 + eor r2, r2, r10, ror #19 + add r10, r8, r12 + eor r3, r3, r11, ror #19 + add r11, r9, lr + eor r4, r4, r10, ror #19 + eor r5, r5, r11, ror #19 + + ldrd r10, [sp, #2*4] + add r6, r2, r6 + add r7, r3, r7 + eor r10, r10, r6, ror #14 + add r6, r4, r8 + eor r11, r11, r7, ror #14 + add r7, r5, r9 + eor r0, r0, r6, ror #14 + eor r1, r1, r7, ror #14 + + + ldrd r6, [sp, #14*4] + strd r2, [sp, #6*4] + strd r10, [sp, #2*4] + add r6, r11, r6 + add r7, r0, r7 + eor r4, r4, r6, ror #25 + add r6, r1, r12 + eor r5, r5, r7, ror #25 + add r7, r10, lr + eor r2, r2, r6, ror #25 + eor r3, r3, r7, ror #25 + strd r2, [sp, #6*4] + + add r10, r3, r10 + ldrd r6, [sp, #10*4] + add r11, r4, r11 + eor r8, r8, r10, ror #23 + add r10, r5, r0 + eor r9, r9, r11, ror #23 + add r11, r2, r1 + eor r6, r6, r10, ror #23 + eor r7, r7, r11, ror #23 + strd r6, [sp, #10*4] + + add r2, r7, r2 + ldrd r10, [sp, #14*4] + add r3, r8, r3 + eor r12, r12, r2, ror #19 + add r2, r9, r4 + eor lr, lr, r3, ror #19 + add r3, r6, r5 + eor r10, r10, r2, ror #19 + eor r11, r11, r3, ror #19 + + ldrd r2, [sp, #2*4] + add r6, r11, r6 + add r7, r12, r7 + eor r0, r0, r6, ror #14 + add r6, lr, r8 + eor r1, r1, r7, ror #14 + add r7, r10, r9 + eor r2, r2, r6, ror #14 + eor r3, r3, r7, ror #14 +.endm + +.macro salsa8_core + ldmia sp, {r0-r12, lr} + + ldrd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, [sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + ldrd r6, [sp, #6*4] + strd r2, 
[sp, #2*4] + strd r10, [sp, #14*4] + salsa8_core_doubleround_body + + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] + strd r10, [sp, #14*4] +.endm + +#else + +.macro scrypt_shuffle +.endm + +.macro salsa8_core_doubleround_body + ldr r8, [sp, #8*4] + add r11, r11, r10 + ldr lr, [sp, #13*4] + add r12, r12, r3 + eor r2, r2, r11, ror #23 + add r11, r4, r0 + eor r7, r7, r12, ror #23 + add r12, r9, r5 + str r9, [sp, #9*4] + eor r8, r8, r11, ror #23 + str r10, [sp, #14*4] + eor lr, lr, r12, ror #23 + + ldr r11, [sp, #11*4] + add r9, lr, r9 + ldr r12, [sp, #12*4] + add r10, r2, r10 + eor r1, r1, r9, ror #19 + add r9, r7, r3 + eor r6, r6, r10, ror #19 + add r10, r8, r4 + str r8, [sp, #8*4] + eor r11, r11, r9, ror #19 + str lr, [sp, #13*4] + eor r12, r12, r10, ror #19 + + ldr r9, [sp, #10*4] + add r8, r12, r8 + ldr r10, [sp, #15*4] + add lr, r1, lr + eor r0, r0, r8, ror #14 + add r8, r6, r2 + eor r5, r5, lr, ror #14 + add lr, r11, r7 + eor r9, r9, r8, ror #14 + ldr r8, [sp, #9*4] + eor r10, r10, lr, ror #14 + ldr lr, [sp, #14*4] + + + add r8, r9, r8 + str r9, [sp, #10*4] + add lr, r10, lr + str r10, [sp, #15*4] + eor r11, r11, r8, ror #25 + add r8, r0, r3 + eor r12, r12, lr, ror #25 + add lr, r5, r4 + eor r1, r1, r8, ror #25 + ldr r8, [sp, #8*4] + eor r6, r6, lr, ror #25 + + add r9, r11, r9 + ldr lr, [sp, #13*4] + add r10, r12, r10 + eor r8, r8, r9, ror #23 + add r9, r1, r0 + eor lr, lr, r10, ror #23 + add r10, r6, r5 + str r11, [sp, #11*4] + eor r2, r2, r9, ror #23 + str r12, [sp, #12*4] + eor r7, r7, r10, ror #23 + + ldr r9, [sp, #9*4] + add r11, r8, r11 + ldr r10, [sp, #14*4] + add r12, lr, r12 + eor r9, r9, r11, ror #19 + add r11, r2, r1 + eor r10, r10, r12, ror #19 + add r12, r7, r6 + str r8, [sp, #8*4] + eor r3, r3, r11, ror #19 + str lr, [sp, #13*4] + eor r4, r4, r12, ror #19 +.endm + +.macro salsa8_core + ldmia sp, {r0-r7} + + ldr r12, [sp, #15*4] + ldr r8, [sp, #11*4] + ldr lr, [sp, #12*4] + + ldr r9, [sp, #9*4] + add r8, r8, r12 + ldr r11, [sp, #10*4] + add lr, lr, r0 + eor r3, r3, r8, ror #25 + add r8, r5, r1 + ldr r10, [sp, #14*4] + eor r4, r4, lr, ror #25 + add lr, r11, r6 + eor r9, r9, r8, ror #25 + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + eor r11, r11, r8, ror #14 + add r8, r3, r2 + eor r12, r12, lr, ror #14 + add lr, r4, r7 + 
eor r0, r0, r8, ror #14 + ldr r8, [sp, #11*4] + eor r5, r5, lr, ror #14 + ldr lr, [sp, #12*4] + + add r8, r8, r12 + str r11, [sp, #10*4] + add lr, lr, r0 + str r12, [sp, #15*4] + eor r3, r3, r8, ror #25 + add r8, r5, r1 + eor r4, r4, lr, ror #25 + add lr, r11, r6 + str r9, [sp, #9*4] + eor r9, r9, r8, ror #25 + str r10, [sp, #14*4] + eor r10, r10, lr, ror #25 + + salsa8_core_doubleround_body + + ldr r11, [sp, #10*4] + add r8, r9, r8 + ldr r12, [sp, #15*4] + add lr, r10, lr + str r9, [sp, #9*4] + eor r11, r11, r8, ror #14 + eor r12, r12, lr, ror #14 + add r8, r3, r2 + str r10, [sp, #14*4] + add lr, r4, r7 + str r11, [sp, #10*4] + eor r0, r0, r8, ror #14 + str r12, [sp, #15*4] + eor r5, r5, lr, ror #14 + + stmia sp, {r0-r7} +.endm + +#endif + + +.macro scrypt_core_macro1a_x4 + ldmia r0, {r4-r7} + ldmia lr!, {r8-r11} + stmia r1!, {r4-r7} + stmia r3!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro1b_x4 + ldmia r3!, {r8-r11} + ldmia r2, {r4-r7} + eor r8, r8, r4 + eor r9, r9, r5 + eor r10, r10, r6 + eor r11, r11, r7 + ldmia r0, {r4-r7} + stmia r2!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + ldmia r1!, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r0!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro2_x4 + ldmia r12, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} + ldmia r2, {r8-r11} + eor r4, r4, r8 + eor r5, r5, r9 + eor r6, r6, r10 + eor r7, r7, r11 + stmia r2!, {r4-r7} + stmia r12!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x4 + ldmia r1!, {r4-r7} + ldmia r0, {r8-r11} + add r4, r4, r8 + add r5, r5, r9 + add r6, r6, r10 + add r7, r7, r11 + stmia r0!, {r4-r7} +.endm + +.macro scrypt_core_macro3_x6 + ldmia r1!, {r2-r7} + ldmia r0, {r8-r12, lr} + add r2, r2, r8 + add r3, r3, r9 + add r4, r4, r10 + add r5, r5, r11 + add r6, r6, r12 + add r7, r7, lr + stmia r0!, {r2-r7} +.endm + + + .text + .code 32 + .align 2 + .globl scrypt_core + .globl _scrypt_core +#ifdef __ELF__ + .type scrypt_core, %function +#endif +scrypt_core: +_scrypt_core: + stmfd sp!, {r4-r11, lr} + mov r12, sp + sub sp, sp, #21*4 + bic sp, sp, #63 + str r12, [sp, #20*4] + + scrypt_shuffle + + str r0, [sp, #16*4] + add r12, r1, #1024*32*4 + str r12, [sp, #18*4] +scrypt_core_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + str r1, [sp, #17*4] + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + ldr r3, [sp, #17*4] + ldr r12, [sp, #18*4] + scrypt_core_macro3_x4 + + add r1, r3, #16*4 + sub r0, r0, #32*4 + cmp r1, r12 + bne scrypt_core_loop1 + + ldr r4, [r0, #16*4] + sub r1, r1, #1024*32*4 + str r1, [sp, #17*4] + mov r4, r4, lsl #32-10 + mov r12, #1024 + add r1, r1, r4, lsr #32-10-7 +scrypt_core_loop2: + add r2, r0, #16*4 + add r3, r1, #16*4 + str r12, [sp, #18*4] + mov r12, sp +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r1, #24*4] + pld [r1, #8*4] +#endif + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r12, sp + add r2, r0, 
#16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + salsa8_core + + ldr r0, [sp, #16*4] + mov r1, sp + ldr r3, [sp, #17*4] + add r0, r0, #16*4 + scrypt_core_macro3_x4 + mov r4, r4, lsl #32-10 + add r3, r3, r4, lsr #32-10-7 + str r3, [sp, #19*4] +#ifdef __ARM_ARCH_5E_OR_6_OR_7__ + pld [r3, #16*4] + pld [r3] +#endif + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + + ldr r12, [sp, #18*4] + sub r0, r0, #32*4 + ldr r1, [sp, #19*4] + subs r12, r12, #1 + bne scrypt_core_loop2 + + scrypt_shuffle + + ldr sp, [sp, #20*4] +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + +.macro salsa8_core_3way_doubleround + ldrd r6, [sp, #6*4] + vadd.u32 q4, q0, q1 + add r6, r2, r6 + vadd.u32 q6, q8, q9 + add r7, r3, r7 + vshl.u32 q5, q4, #7 + eor r10, r10, r6, ror #25 + vshl.u32 q7, q6, #7 + add r6, r0, r4 + vshr.u32 q4, q4, #32-7 + eor r11, r11, r7, ror #25 + vshr.u32 q6, q6, #32-7 + add r7, r1, r5 + veor.u32 q3, q3, q5 + strd r10, [sp, #14*4] + veor.u32 q11, q11, q7 + eor r12, r12, r6, ror #25 + veor.u32 q3, q3, q4 + eor lr, lr, r7, ror #25 + veor.u32 q11, q11, q6 + + ldrd r6, [sp, #10*4] + vadd.u32 q4, q3, q0 + add r2, r10, r2 + vadd.u32 q6, q11, q8 + add r3, r11, r3 + vshl.u32 q5, q4, #9 + eor r6, r6, r2, ror #23 + vshl.u32 q7, q6, #9 + add r2, r12, r0 + vshr.u32 q4, q4, #32-9 + eor r7, r7, r3, ror #23 + vshr.u32 q6, q6, #32-9 + add r3, lr, r1 + veor.u32 q2, q2, q5 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q7 + eor r8, r8, r2, ror #23 + veor.u32 q2, q2, q4 + eor r9, r9, r3, ror #23 + veor.u32 q10, q10, q6 + + ldrd r2, [sp, #6*4] + vadd.u32 q4, q2, q3 + add r10, r6, r10 + vadd.u32 q6, q10, q11 + add r11, r7, r11 + vext.u32 q3, q3, q3, #3 + eor r2, r2, r10, ror #19 + vshl.u32 q5, q4, #13 + add r10, r8, r12 + vext.u32 q11, q11, q11, #3 + eor r3, r3, r11, ror #19 + vshl.u32 q7, q6, #13 + add r11, r9, lr + vshr.u32 q4, q4, #32-13 + eor r4, r4, r10, ror #19 + vshr.u32 q6, q6, #32-13 + eor r5, r5, r11, ror #19 + veor.u32 q1, q1, q5 + veor.u32 q9, q9, q7 + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + ldrd r10, [sp, #2*4] + vadd.u32 q4, q1, q2 + add r6, r2, r6 + vadd.u32 q6, q9, q10 + add r7, r3, r7 + vswp.u32 d4, d5 + eor r10, r10, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, r4, r8 + vswp.u32 d20, d21 + eor r11, r11, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r5, r9 + vshr.u32 q4, q4, #32-18 + eor r0, r0, r6, ror #14 + vshr.u32 q6, q6, #32-18 + eor r1, r1, r7, ror #14 + veor.u32 q0, q0, q5 + ldrd r6, [sp, #14*4] + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 + + + strd r2, [sp, #6*4] + vadd.u32 q4, q0, q3 + strd r10, [sp, #2*4] + vadd.u32 q6, q8, q11 + add r6, r11, r6 + vext.u32 q1, q1, q1, #1 + add r7, r0, r7 + vshl.u32 q5, q4, #7 + eor r4, r4, r6, ror #25 + vext.u32 q9, q9, q9, #1 + add r6, r1, r12 + vshl.u32 q7, q6, #7 + eor r5, r5, r7, ror #25 + vshr.u32 q4, q4, #32-7 + add r7, r10, lr + vshr.u32 q6, q6, #32-7 + eor r2, r2, r6, ror #25 + veor.u32 q1, q1, q5 + eor r3, r3, r7, ror #25 + veor.u32 q9, q9, q7 + strd r2, [sp, #6*4] + veor.u32 q1, q1, q4 + veor.u32 q9, q9, q6 + + add r10, r3, r10 + vadd.u32 q4, q1, q0 + ldrd r6, [sp, #10*4] + vadd.u32 q6, q9, q8 + add r11, r4, r11 + vshl.u32 q5, q4, #9 + eor r8, r8, r10, ror #23 + vshl.u32 q7, q6, #9 + add r10, r5, r0 + vshr.u32 q4, q4, #32-9 + eor r9, r9, r11, ror #23 + vshr.u32 q6, q6, #32-9 + add r11, r2, r1 + veor.u32 q2, q2, q5 + eor r6, r6, r10, ror #23 + veor.u32 q10, q10, q7 + eor r7, r7, r11, ror #23 + veor.u32 q2, 
q2, q4 + strd r6, [sp, #10*4] + veor.u32 q10, q10, q6 + + add r2, r7, r2 + vadd.u32 q4, q2, q1 + ldrd r10, [sp, #14*4] + vadd.u32 q6, q10, q9 + add r3, r8, r3 + vext.u32 q1, q1, q1, #3 + eor r12, r12, r2, ror #19 + vshl.u32 q5, q4, #13 + add r2, r9, r4 + vext.u32 q9, q9, q9, #3 + eor lr, lr, r3, ror #19 + vshl.u32 q7, q6, #13 + add r3, r6, r5 + vshr.u32 q4, q4, #32-13 + eor r10, r10, r2, ror #19 + vshr.u32 q6, q6, #32-13 + eor r11, r11, r3, ror #19 + veor.u32 q3, q3, q5 + veor.u32 q11, q11, q7 + veor.u32 q3, q3, q4 + veor.u32 q11, q11, q6 + + ldrd r2, [sp, #2*4] + vadd.u32 q4, q3, q2 + add r6, r11, r6 + vadd.u32 q6, q11, q10 + add r7, r12, r7 + vswp.u32 d4, d5 + eor r0, r0, r6, ror #14 + vshl.u32 q5, q4, #18 + add r6, lr, r8 + vswp.u32 d20, d21 + eor r1, r1, r7, ror #14 + vshl.u32 q7, q6, #18 + add r7, r10, r9 + vext.u32 q3, q3, q3, #1 + eor r2, r2, r6, ror #14 + vshr.u32 q4, q4, #32-18 + eor r3, r3, r7, ror #14 + vshr.u32 q6, q6, #32-18 + strd r2, [sp, #2*4] + vext.u32 q11, q11, q11, #1 + strd r10, [sp, #14*4] + veor.u32 q0, q0, q5 + veor.u32 q8, q8, q7 + veor.u32 q0, q0, q4 + veor.u32 q8, q8, q6 +.endm + +.macro salsa8_core_3way + ldmia sp, {r0-r12, lr} + ldrd r10, [sp, #14*4] + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + salsa8_core_3way_doubleround + stmia sp, {r0-r5} + strd r8, [sp, #8*4] + str r12, [sp, #12*4] + str lr, [sp, #13*4] +.endm + + .text + .code 32 + .align 2 + .globl scrypt_core_3way + .globl _scrypt_core_3way +#ifdef __ELF__ + .type scrypt_core_3way, %function +#endif +scrypt_core_3way: +_scrypt_core_3way: + stmfd sp!, {r4-r11, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #24*16 + bic sp, sp, #63 + str r12, [sp, #4*16+3*4] + + mov r2, r0 + vldmia r2!, {q8-q15} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vldmia r2!, {q0-q7} + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0, {q8-q15} + vmov.u64 q8, #0xffffffff + vmov.u32 q9, q0 + vmov.u32 q10, q4 + vbif.u32 q0, q1, q8 + vbif.u32 q4, q5, q8 + vbif.u32 q1, q2, q8 + vbif.u32 q5, q6, q8 + vbif.u32 q2, q3, q8 + vbif.u32 q6, q7, q8 + vbif.u32 q3, q9, q8 + vbif.u32 q7, q10, q8 + vldmia r2, {q8-q15} + vswp.u32 d1, d5 + vswp.u32 d9, d13 + vswp.u32 d2, d6 + vswp.u32 d10, d14 + add r12, sp, #8*16 + vstmia r12!, {q0-q7} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r12, {q8-q15} + + add lr, sp, #128 + vldmia lr, {q0-q7} + add r2, r1, #1024*32*4 + str r0, [sp, #4*16+0*4] + str r2, [sp, #4*16+2*4] +scrypt_core_3way_loop1: + add lr, r0, #16*4 + add r3, r1, #16*4 + str r1, [sp, #4*16+1*4] + mov r12, sp + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4 + sub r1, r1, #4*16 + + add r1, r1, #1024*32*4 + vstmia r1, {q0-q7} + add r3, r1, #1024*32*4 + vstmia r3, {q8-q15} + + add lr, sp, #128 + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + add r12, sp, 
#256 + vstmia r12, {q8-q11} + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q0-q3} + vstmia lr, {q4-q7} + vadd.u32 q8, q8, q0 + vadd.u32 q9, q9, q1 + vadd.u32 q10, q10, q2 + vadd.u32 q11, q11, q3 + + add r4, sp, #128+4*16 + vldmia r4, {q0-q3} + vstmia r12, {q8-q11} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + scrypt_core_macro3_x4 + sub r0, r0, #8*16 + + ldr r1, [sp, #4*16+1*4] + ldr r2, [sp, #4*16+2*4] + add lr, sp, #128 + add r4, sp, #128+4*16 + vldmia r4, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + vstmia r4, {q4-q7} + vldmia lr, {q0-q3} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + add r12, sp, #256 + vldmia r12, {q8-q11} + + add r1, r1, #8*16 + cmp r1, r2 + bne scrypt_core_3way_loop1 + + add r5, sp, #256+4*16 + vstmia r5, {q12-q15} + + sub r1, r1, #1024*32*4 + str r1, [sp, #4*16+1*4] + mov r2, #1024 +scrypt_core_3way_loop2: + str r2, [sp, #4*16+2*4] + + ldr r0, [sp, #4*16+0*4] + ldr r1, [sp, #4*16+1*4] + ldr r4, [r0, #16*4] + mov r4, r4, lsl #32-10 + add r1, r1, r4, lsr #32-10-7 + add r2, r0, #16*4 + add r3, r1, #16*4 + mov r12, sp + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4 + + ldr r1, [sp, #4*16+1*4] + add r1, r1, #1024*32*4 + add r3, r1, #1024*32*4 + vmov r6, r7, d8 + mov r6, r6, lsl #32-10 + add r6, r1, r6, lsr #32-10-7 + vmov r7, r8, d24 + add lr, sp, #128 + vldmia lr, {q0-q3} + pld [r6] + pld [r6, #8*4] + pld [r6, #16*4] + pld [r6, #24*4] + vldmia r6, {q8-q15} + mov r7, r7, lsl #32-10 + add r7, r3, r7, lsr #32-10-7 + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + pld [r7] + pld [r7, #8*4] + pld [r7, #16*4] + pld [r7, #24*4] + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + vldmia r7, {q0-q7} + vstmia lr, {q8-q15} + add r12, sp, #256 + vldmia r12, {q8-q15} + veor.u32 q8, q8, q0 + veor.u32 q9, q9, q1 + veor.u32 q10, q10, q2 + veor.u32 q11, q11, q3 + veor.u32 q12, q12, q4 + veor.u32 q13, q13, q5 + veor.u32 q14, q14, q6 + veor.u32 q15, q15, q7 + + vldmia lr, {q0-q7} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + vstmia lr, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vstmia r12, {q8-q15} + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + mov r12, sp + add r2, r0, #16*4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + scrypt_core_macro2_x4 + + add lr, sp, #128 + vldmia lr, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + add r12, sp, #256 + vldmia r12, {q12-q15} + vstmia lr, {q4-q7} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + + add r4, sp, #128+4*16 + vldmia r4, 
{q0-q3} + vstmia r12, {q12-q15} + veor.u32 q0, q0, q4 + veor.u32 q1, q1, q5 + veor.u32 q2, q2, q6 + veor.u32 q3, q3, q7 + add r5, sp, #256+4*16 + vldmia r5, {q8-q11} + vstmia r4, {q0-q3} + veor.u32 q8, q8, q12 + veor.u32 q9, q9, q13 + veor.u32 q10, q10, q14 + veor.u32 q11, q11, q15 + vmov q12, q8 + vmov q13, q9 + vmov q14, q10 + vmov q15, q11 + + salsa8_core_3way + + ldr r0, [sp, #4*16+0*4] + ldr r3, [sp, #4*16+1*4] + mov r1, sp + add r0, r0, #16*4 + scrypt_core_macro3_x4 + mov r4, r4, lsl #32-10 + add r3, r3, r4, lsr #32-10-7 + pld [r3, #16*4] + pld [r3] + pld [r3, #24*4] + pld [r3, #8*4] + scrypt_core_macro3_x6 + scrypt_core_macro3_x6 + + add lr, sp, #128 + add r4, sp, #128+4*16 + vldmia r4, {q4-q7} + vadd.u32 q4, q4, q0 + vadd.u32 q5, q5, q1 + vadd.u32 q6, q6, q2 + vadd.u32 q7, q7, q3 + vstmia r4, {q4-q7} + vadd.u32 q12, q12, q8 + vadd.u32 q13, q13, q9 + vadd.u32 q14, q14, q10 + vadd.u32 q15, q15, q11 + add r5, sp, #256+4*16 + vstmia r5, {q12-q15} + + ldr r2, [sp, #4*16+2*4] + subs r2, r2, #1 + bne scrypt_core_3way_loop2 + + ldr r0, [sp, #4*16+0*4] + vldmia r0, {q8-q15} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + add r12, sp, #8*16 + vldmia r12!, {q0-q7} + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0!, {q8-q15} + vmov.u64 q8, #0xffffffff + vmov.u32 q9, q0 + vmov.u32 q10, q4 + vbif.u32 q0, q1, q8 + vbif.u32 q4, q5, q8 + vbif.u32 q1, q2, q8 + vbif.u32 q5, q6, q8 + vbif.u32 q2, q3, q8 + vbif.u32 q6, q7, q8 + vbif.u32 q3, q9, q8 + vbif.u32 q7, q10, q8 + vldmia r12, {q8-q15} + vswp.u32 d1, d5 + vswp.u32 d9, d13 + vswp.u32 d2, d6 + vswp.u32 d10, d14 + vstmia r0!, {q0-q7} + vmov.u64 q0, #0xffffffff + vmov.u32 q1, q8 + vmov.u32 q2, q12 + vbif.u32 q8, q9, q0 + vbif.u32 q12, q13, q0 + vbif.u32 q9, q10, q0 + vbif.u32 q13, q14, q0 + vbif.u32 q10, q11, q0 + vbif.u32 q14, q15, q0 + vbif.u32 q11, q1, q0 + vbif.u32 q15, q2, q0 + vswp.u32 d17, d21 + vswp.u32 d25, d29 + vswp.u32 d18, d22 + vswp.u32 d26, d30 + vstmia r0, {q8-q15} + + ldr sp, [sp, #4*16+3*4] + vpop {q4-q7} + ldmfd sp!, {r4-r11, pc} + +#endif /* __ARM_NEON__ */ + +#endif diff --git a/scrypt-x64.S b/scrypt-x64.S new file mode 100644 index 00000000..c95fa45f --- /dev/null +++ b/scrypt-x64.S @@ -0,0 +1,2879 @@ +/* + * Copyright 2011-2013 pooler@litecoinpool.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + + .text + .p2align 6 + .globl scrypt_best_throughput + .globl _scrypt_best_throughput +scrypt_best_throughput: +_scrypt_best_throughput: + pushq %rbx +#if defined(USE_AVX2) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_best_throughput_no_avx2 + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne scrypt_best_throughput_no_avx2 + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_best_throughput_no_avx2 + movl $6, %eax + jmp scrypt_best_throughput_exit +scrypt_best_throughput_no_avx2: +#endif + /* Check for AuthenticAMD */ + xorq %rax, %rax + cpuid + movl $3, %eax + cmpl $0x444d4163, %ecx + jne scrypt_best_throughput_not_amd + cmpl $0x69746e65, %edx + jne scrypt_best_throughput_not_amd + cmpl $0x68747541, %ebx + jne scrypt_best_throughput_not_amd + /* Check for AMD K8 or Bobcat */ + movl $1, %eax + cpuid + andl $0x0ff00000, %eax + jz scrypt_best_throughput_one + cmpl $0x00500000, %eax + je scrypt_best_throughput_one + movl $3, %eax + jmp scrypt_best_throughput_exit +scrypt_best_throughput_not_amd: + /* Check for GenuineIntel */ + cmpl $0x6c65746e, %ecx + jne scrypt_best_throughput_exit + cmpl $0x49656e69, %edx + jne scrypt_best_throughput_exit + cmpl $0x756e6547, %ebx + jne scrypt_best_throughput_exit + /* Check for Intel Atom */ + movl $1, %eax + cpuid + movl %eax, %edx + andl $0x0ff00f00, %eax + cmpl $0x00000600, %eax + movl $3, %eax + jnz scrypt_best_throughput_exit + andl $0x000f00f0, %edx + cmpl $0x000100c0, %edx + je scrypt_best_throughput_one + cmpl $0x00020060, %edx + je scrypt_best_throughput_one + cmpl $0x00030060, %edx + jne scrypt_best_throughput_exit +scrypt_best_throughput_one: + movl $1, %eax +scrypt_best_throughput_exit: + popq %rbx + ret + + +.macro scrypt_shuffle src, so, dest, do + movl \so+60(\src), %r8d + movl \so+44(\src), %r9d + movl \so+28(\src), %r10d + movl \so+12(\src), %r11d + movl %r8d, \do+12(\dest) + movl %r9d, \do+28(\dest) + movl %r10d, \do+44(\dest) + movl %r11d, \do+60(\dest) + movl \so+40(\src), %r8d + movl \so+8(\src), %r9d + movl \so+48(\src), %r10d + movl \so+16(\src), %r11d + movl %r8d, \do+8(\dest) + movl %r9d, \do+40(\dest) + movl %r10d, \do+16(\dest) + movl %r11d, \do+48(\dest) + movl \so+20(\src), %r8d + movl \so+4(\src), %r9d + movl \so+52(\src), %r10d + movl \so+36(\src), %r11d + movl %r8d, \do+4(\dest) + movl %r9d, \do+20(\dest) + movl %r10d, \do+36(\dest) + movl %r11d, \do+52(\dest) + movl \so+0(\src), %r8d + movl \so+24(\src), %r9d + movl \so+32(\src), %r10d + movl \so+56(\src), %r11d + movl %r8d, \do+0(\dest) + movl %r9d, \do+24(\dest) + movl %r10d, \do+32(\dest) + movl %r11d, \do+56(\dest) +.endm + + +.macro 
salsa8_core_gen_doubleround + movq 72(%rsp), %r15 + + leaq (%r14, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %r9d + leaq (%rdi, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r10d + leaq (%rdx, %r9), %rbp + roll $9, %ebp + xorl %ebp, %r11d + leaq (%r15, %r10), %rbp + roll $9, %ebp + xorl %ebp, %r13d + + leaq (%r9, %r11), %rbp + roll $13, %ebp + xorl %ebp, %r14d + leaq (%r10, %r13), %rbp + roll $13, %ebp + xorl %ebp, %edi + leaq (%r11, %r14), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r13, %rdi), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) + + leaq (%rax, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %ebx + leaq (%rbp, %rbx), %r15 + roll $9, %r15d + xorl %r15d, %ecx + leaq (%rbx, %rcx), %r15 + roll $13, %r15d + xorl %r15d, %eax + leaq (%rcx, %rax), %r15 + roll $18, %r15d + xorl %r15d, %ebp + + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) + + leaq (%r12, %r15), %rbp + roll $7, %ebp + xorl %ebp, %esi + leaq (%r15, %rsi), %rbp + roll $9, %ebp + xorl %ebp, %r8d + leaq (%rsi, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r12d + leaq (%r8, %r12), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq %r15, 88(%rsp) + movq 72(%rsp), %r15 + + leaq (%rsi, %rdx), %rbp + roll $7, %ebp + xorl %ebp, %edi + leaq (%r9, %r15), %rbp + roll $7, %ebp + xorl %ebp, %eax + leaq (%rdx, %rdi), %rbp + roll $9, %ebp + xorl %ebp, %ecx + leaq (%r15, %rax), %rbp + roll $9, %ebp + xorl %ebp, %r8d + + leaq (%rdi, %rcx), %rbp + roll $13, %ebp + xorl %ebp, %esi + leaq (%rax, %r8), %rbp + roll $13, %ebp + xorl %ebp, %r9d + leaq (%rcx, %rsi), %rbp + roll $18, %ebp + xorl %ebp, %edx + leaq (%r8, %r9), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq 48(%rsp), %rbp + movq %r15, 72(%rsp) + + leaq (%r10, %rbp), %r15 + roll $7, %r15d + xorl %r15d, %r12d + leaq (%rbp, %r12), %r15 + roll $9, %r15d + xorl %r15d, %r11d + leaq (%r12, %r11), %r15 + roll $13, %r15d + xorl %r15d, %r10d + leaq (%r11, %r10), %r15 + roll $18, %r15d + xorl %r15d, %ebp + + movq 88(%rsp), %r15 + movq %rbp, 48(%rsp) + + leaq (%rbx, %r15), %rbp + roll $7, %ebp + xorl %ebp, %r14d + leaq (%r15, %r14), %rbp + roll $9, %ebp + xorl %ebp, %r13d + leaq (%r14, %r13), %rbp + roll $13, %ebp + xorl %ebp, %ebx + leaq (%r13, %rbx), %rbp + roll $18, %ebp + xorl %ebp, %r15d + + movq %r15, 88(%rsp) +.endm + + .text + .p2align 6 +salsa8_core_gen: + /* 0: %rdx, %rdi, %rcx, %rsi */ + movq 8(%rsp), %rdi + movq %rdi, %rdx + shrq $32, %rdi + movq 16(%rsp), %rsi + movq %rsi, %rcx + shrq $32, %rsi + /* 1: %r9, 72(%rsp), %rax, %r8 */ + movq 24(%rsp), %r8 + movq %r8, %r9 + shrq $32, %r8 + movq %r8, 72(%rsp) + movq 32(%rsp), %r8 + movq %r8, %rax + shrq $32, %r8 + /* 2: %r11, %r10, 48(%rsp), %r12 */ + movq 40(%rsp), %r10 + movq %r10, %r11 + shrq $32, %r10 + movq 48(%rsp), %r12 + /* movq %r12, %r13 */ + /* movq %r13, 48(%rsp) */ + shrq $32, %r12 + /* 3: %r14, %r13, %rbx, 88(%rsp) */ + movq 56(%rsp), %r13 + movq %r13, %r14 + shrq $32, %r13 + movq 64(%rsp), %r15 + movq %r15, %rbx + shrq $32, %r15 + movq %r15, 88(%rsp) + + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + salsa8_core_gen_doubleround + + shlq $32, %rdi + xorq %rdi, %rdx + movq %rdx, 24(%rsp) + + shlq $32, %rsi + xorq %rsi, %rcx + movq %rcx, 32(%rsp) + + movl 72(%rsp), %edi + shlq $32, %rdi + xorq %rdi, %r9 + movq %r9, 40(%rsp) + + movl 48(%rsp), %ebp + shlq $32, %r8 + xorq %r8, %rax + movq %rax, 48(%rsp) + + shlq $32, %r10 + xorq %r10, %r11 + movq %r11, 56(%rsp) + + shlq $32, %r12 + xorq %r12, %rbp + movq %rbp, 64(%rsp) + + shlq $32, %r13 + xorq %r13, %r14 + 
movq %r14, 72(%rsp) + + movdqa 24(%rsp), %xmm0 + + shlq $32, %r15 + xorq %r15, %rbx + movq %rbx, 80(%rsp) + + movdqa 40(%rsp), %xmm1 + movdqa 56(%rsp), %xmm2 + movdqa 72(%rsp), %xmm3 + + ret + + + .text + .p2align 6 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + +.macro scrypt_core_cleanup +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +.endm + + /* GenuineIntel processors have fast SIMD */ + xorl %eax, %eax + cpuid + cmpl $0x6c65746e, %ecx + jne scrypt_core_gen + cmpl $0x49656e69, %edx + jne scrypt_core_gen + cmpl $0x756e6547, %ebx + je scrypt_core_xmm + + .p2align 6 +scrypt_core_gen: + subq $136, %rsp + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm9 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm11 + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm13 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm15 + + leaq 131072(%rsi), %rcx + movq %rdi, 104(%rsp) + movq %rsi, 112(%rsp) + movq %rcx, 120(%rsp) +scrypt_core_gen_loop1: + movdqa %xmm8, 0(%rsi) + movdqa %xmm9, 16(%rsi) + movdqa %xmm10, 32(%rsi) + movdqa %xmm11, 48(%rsi) + movdqa %xmm12, 64(%rsi) + movdqa %xmm13, 80(%rsi) + movdqa %xmm14, 96(%rsi) + movdqa %xmm15, 112(%rsi) + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rsi, 128(%rsp) + call salsa8_core_gen + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 128(%rsp), %rsi + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + addq $128, %rsi + movq 120(%rsp), %rcx + cmpq %rcx, %rsi + jne scrypt_core_gen_loop1 + + movq $1024, %rcx + movd %xmm12, %edx +scrypt_core_gen_loop2: + movq 112(%rsp), %rsi + andl $1023, %edx + shll $7, %edx + addq %rsi, %rdx + movdqa 0(%rdx), %xmm0 + movdqa 16(%rdx), %xmm1 + movdqa 32(%rdx), %xmm2 + movdqa 48(%rdx), %xmm3 + movdqa 64(%rdx), %xmm4 + movdqa 80(%rdx), %xmm5 + movdqa 96(%rdx), %xmm6 + movdqa 112(%rdx), %xmm7 + pxor %xmm0, %xmm8 + pxor %xmm1, %xmm9 + pxor %xmm2, %xmm10 + pxor %xmm3, %xmm11 + pxor %xmm4, %xmm12 + pxor %xmm5, %xmm13 + pxor %xmm6, %xmm14 + pxor %xmm7, %xmm15 + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rsp) + movdqa %xmm9, 16(%rsp) + movdqa %xmm10, 32(%rsp) + movdqa %xmm11, 48(%rsp) + movq %rcx, 128(%rsp) + call salsa8_core_gen + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, 
%xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, 0(%rsp) + movdqa %xmm13, 16(%rsp) + movdqa %xmm14, 32(%rsp) + movdqa %xmm15, 48(%rsp) + call salsa8_core_gen + movq 128(%rsp), %rcx + addl 0(%rsp), %edx + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja scrypt_core_gen_loop2 + + movq 104(%rsp), %rdi + movdqa %xmm8, 0(%rdi) + movdqa %xmm9, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm11, 48(%rdi) + movdqa %xmm12, 64(%rdi) + movdqa %xmm13, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm15, 112(%rdi) + + addq $136, %rsp + scrypt_core_cleanup + ret + + +.macro salsa8_core_xmm_doubleround + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 +.endm + +.macro salsa8_core_xmm + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround + salsa8_core_xmm_doubleround +.endm + + .p2align 6 +scrypt_core_xmm: + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 + + movdqa 0(%rdi), %xmm8 + movdqa 16(%rdi), %xmm11 + movdqa 32(%rdi), %xmm10 + movdqa 48(%rdi), %xmm9 + movdqa %xmm8, %xmm0 + pxor %xmm11, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm11, %xmm8 + pxor %xmm10, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm10, %xmm11 + pxor %xmm9, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm9, %xmm10 + pxor %xmm0, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm0, %xmm9 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm11, %xmm0 + pshufd $0x4e, %xmm9, %xmm9 + punpcklqdq %xmm9, %xmm11 + punpckhqdq %xmm0, %xmm9 + + movdqa 64(%rdi), %xmm12 + movdqa 80(%rdi), %xmm15 + movdqa 96(%rdi), %xmm14 + movdqa 112(%rdi), %xmm13 + movdqa %xmm12, %xmm0 + pxor %xmm15, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm15, %xmm12 + pxor %xmm14, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm14, %xmm15 + pxor %xmm13, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm13, %xmm14 + pxor %xmm0, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm0, %xmm13 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm15, %xmm0 + pshufd $0x4e, %xmm13, %xmm13 + punpcklqdq %xmm13, %xmm15 + punpckhqdq %xmm0, %xmm13 + + movq %rsi, %rdx + leaq 131072(%rsi), %rcx 
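+	/* First loop: fill the scratchpad. 1024 iterations storing 128 bytes
+	 * each account for the 131072-byte region bounded by %rcx above. */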
+scrypt_core_xmm_loop1: + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, 0(%rdx) + movdqa %xmm9, 16(%rdx) + movdqa %xmm10, 32(%rdx) + movdqa %xmm11, 48(%rdx) + movdqa %xmm12, 64(%rdx) + movdqa %xmm13, 80(%rdx) + movdqa %xmm14, 96(%rdx) + movdqa %xmm15, 112(%rdx) + + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + addq $128, %rdx + cmpq %rcx, %rdx + jne scrypt_core_xmm_loop1 + + movq $1024, %rcx +scrypt_core_xmm_loop2: + movd %xmm12, %edx + andl $1023, %edx + shll $7, %edx + pxor 0(%rsi, %rdx), %xmm8 + pxor 16(%rsi, %rdx), %xmm9 + pxor 32(%rsi, %rdx), %xmm10 + pxor 48(%rsi, %rdx), %xmm11 + + pxor %xmm12, %xmm8 + pxor %xmm13, %xmm9 + pxor %xmm14, %xmm10 + pxor %xmm15, %xmm11 + movdqa %xmm8, %xmm0 + movdqa %xmm9, %xmm1 + movdqa %xmm10, %xmm2 + movdqa %xmm11, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm8 + paddd %xmm1, %xmm9 + paddd %xmm2, %xmm10 + paddd %xmm3, %xmm11 + + pxor 64(%rsi, %rdx), %xmm12 + pxor 80(%rsi, %rdx), %xmm13 + pxor 96(%rsi, %rdx), %xmm14 + pxor 112(%rsi, %rdx), %xmm15 + pxor %xmm8, %xmm12 + pxor %xmm9, %xmm13 + pxor %xmm10, %xmm14 + pxor %xmm11, %xmm15 + movdqa %xmm12, %xmm0 + movdqa %xmm13, %xmm1 + movdqa %xmm14, %xmm2 + movdqa %xmm15, %xmm3 + salsa8_core_xmm + paddd %xmm0, %xmm12 + paddd %xmm1, %xmm13 + paddd %xmm2, %xmm14 + paddd %xmm3, %xmm15 + + subq $1, %rcx + ja scrypt_core_xmm_loop2 + + pcmpeqw %xmm1, %xmm1 + psrlq $32, %xmm1 + + movdqa %xmm8, %xmm0 + pxor %xmm9, %xmm8 + pand %xmm1, %xmm8 + pxor %xmm9, %xmm8 + pxor %xmm10, %xmm9 + pand %xmm1, %xmm9 + pxor %xmm10, %xmm9 + pxor %xmm11, %xmm10 + pand %xmm1, %xmm10 + pxor %xmm11, %xmm10 + pxor %xmm0, %xmm11 + pand %xmm1, %xmm11 + pxor %xmm0, %xmm11 + movdqa %xmm8, %xmm0 + pshufd $0x4e, %xmm10, %xmm10 + punpcklqdq %xmm10, %xmm8 + punpckhqdq %xmm0, %xmm10 + movdqa %xmm9, %xmm0 + pshufd $0x4e, %xmm11, %xmm11 + punpcklqdq %xmm11, %xmm9 + punpckhqdq %xmm0, %xmm11 + movdqa %xmm8, 0(%rdi) + movdqa %xmm11, 16(%rdi) + movdqa %xmm10, 32(%rdi) + movdqa %xmm9, 48(%rdi) + + movdqa %xmm12, %xmm0 + pxor %xmm13, %xmm12 + pand %xmm1, %xmm12 + pxor %xmm13, %xmm12 + pxor %xmm14, %xmm13 + pand %xmm1, %xmm13 + pxor %xmm14, %xmm13 + pxor %xmm15, %xmm14 + pand %xmm1, %xmm14 + pxor %xmm15, %xmm14 + pxor %xmm0, %xmm15 + pand %xmm1, %xmm15 + pxor %xmm0, %xmm15 + movdqa %xmm12, %xmm0 + pshufd $0x4e, %xmm14, %xmm14 + punpcklqdq %xmm14, %xmm12 + punpckhqdq %xmm0, %xmm14 + movdqa %xmm13, %xmm0 + pshufd $0x4e, %xmm15, %xmm15 + punpcklqdq %xmm15, %xmm13 + punpckhqdq %xmm0, %xmm15 + movdqa %xmm12, 64(%rdi) + movdqa %xmm15, 80(%rdi) + movdqa %xmm14, 96(%rdi) + movdqa %xmm13, 112(%rdi) + + scrypt_core_cleanup + ret + + +#if defined(USE_AVX) +.macro salsa8_core_3way_avx_doubleround + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor 
%xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vpslld $7, %xmm4, %xmm5 + vpsrld $25, %xmm4, %xmm4 + vpshufd $0x39, %xmm1, %xmm1 + vpxor %xmm5, %xmm1, %xmm1 + vpxor %xmm4, %xmm1, %xmm1 + vpslld $7, %xmm6, %xmm5 + vpsrld $25, %xmm6, %xmm6 + vpshufd $0x39, %xmm9, %xmm9 + vpxor %xmm5, %xmm9, %xmm9 + vpxor %xmm6, %xmm9, %xmm9 + vpslld $7, %xmm7, %xmm5 + vpsrld $25, %xmm7, %xmm7 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm5, %xmm13, %xmm13 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vpslld $9, %xmm4, %xmm5 + vpsrld $23, %xmm4, %xmm4 + vpxor %xmm5, %xmm2, %xmm2 + vpxor %xmm4, %xmm2, %xmm2 + vpslld $9, %xmm6, %xmm5 + vpsrld $23, %xmm6, %xmm6 + vpxor %xmm5, %xmm10, %xmm10 + vpxor %xmm6, %xmm10, %xmm10 + vpslld $9, %xmm7, %xmm5 + vpsrld $23, %xmm7, %xmm7 + vpxor %xmm5, %xmm14, %xmm14 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vpslld $13, %xmm4, %xmm5 + vpsrld $19, %xmm4, %xmm4 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm5, %xmm3, %xmm3 + vpxor %xmm4, %xmm3, %xmm3 + vpslld $13, %xmm6, %xmm5 + vpsrld $19, %xmm6, %xmm6 + vpxor %xmm5, %xmm11, %xmm11 + vpxor %xmm6, %xmm11, %xmm11 + vpslld $13, %xmm7, %xmm5 + vpsrld $19, %xmm7, %xmm7 + vpxor %xmm5, %xmm15, %xmm15 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vpslld $18, %xmm4, %xmm5 + vpsrld $14, %xmm4, %xmm4 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpxor %xmm5, %xmm0, %xmm0 + vpxor %xmm4, %xmm0, %xmm0 + vpslld $18, %xmm6, %xmm5 + vpsrld $14, %xmm6, %xmm6 + vpshufd $0x4e, %xmm14, %xmm14 + vpshufd $0x39, %xmm11, %xmm11 + vpxor %xmm5, %xmm8, %xmm8 + vpxor %xmm6, %xmm8, %xmm8 + vpslld $18, %xmm7, %xmm5 + vpsrld $14, %xmm7, %xmm7 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm15, %xmm15 + vpxor %xmm5, %xmm12, %xmm12 + vpxor %xmm7, 
%xmm12, %xmm12 +.endm + +.macro salsa8_core_3way_avx + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround + salsa8_core_3way_avx_doubleround +.endm +#endif /* USE_AVX */ + + .text + .p2align 6 + .globl scrypt_core_3way + .globl _scrypt_core_3way +scrypt_core_3way: +_scrypt_core_3way: + pushq %rbx + pushq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + movdqa %xmm6, 8(%rsp) + movdqa %xmm7, 24(%rsp) + movdqa %xmm8, 40(%rsp) + movdqa %xmm9, 56(%rsp) + movdqa %xmm10, 72(%rsp) + movdqa %xmm11, 88(%rsp) + movdqa %xmm12, 104(%rsp) + movdqa %xmm13, 120(%rsp) + movdqa %xmm14, 136(%rsp) + movdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + subq $392, %rsp + +.macro scrypt_core_3way_cleanup + addq $392, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + movdqa 8(%rsp), %xmm6 + movdqa 24(%rsp), %xmm7 + movdqa 40(%rsp), %xmm8 + movdqa 56(%rsp), %xmm9 + movdqa 72(%rsp), %xmm10 + movdqa 88(%rsp), %xmm11 + movdqa 104(%rsp), %xmm12 + movdqa 120(%rsp), %xmm13 + movdqa 136(%rsp), %xmm14 + movdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx +.endm + +#if !defined(USE_AVX) + jmp scrypt_core_3way_xmm +#else + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_core_3way_xmm + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_core_3way_xmm +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jnz scrypt_core_3way_xop +#endif + +scrypt_core_3way_avx: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq 3*131072(%rsi), %rax +scrypt_core_3way_avx_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_avx + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), 
%xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_avx + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_avx_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq $1024, %rcx +scrypt_core_3way_avx_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl $1023, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl $1023, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl $1023, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_avx + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), 
%xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_avx + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_avx_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + scrypt_core_3way_cleanup + ret + +#if defined(USE_XOP) +.macro salsa8_core_3way_xop_doubleround + vpaddd %xmm0, %xmm1, %xmm4 + vpaddd %xmm8, %xmm9, %xmm6 + vpaddd %xmm12, %xmm13, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm0, %xmm4 + vpaddd %xmm11, %xmm8, %xmm6 + vpaddd %xmm15, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm3, %xmm4 + vpaddd %xmm10, %xmm11, %xmm6 + vpaddd %xmm14, %xmm15, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm3, %xmm3 + vpshufd $0x93, %xmm11, %xmm11 + vpshufd $0x93, %xmm15, %xmm15 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm2, %xmm4 + vpaddd %xmm9, %xmm10, %xmm6 + vpaddd %xmm13, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, 
%xmm14 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm7, %xmm12, %xmm12 + + vpaddd %xmm0, %xmm3, %xmm4 + vpaddd %xmm8, %xmm11, %xmm6 + vpaddd %xmm12, %xmm15, %xmm7 + vprotd $7, %xmm4, %xmm4 + vprotd $7, %xmm6, %xmm6 + vprotd $7, %xmm7, %xmm7 + vpshufd $0x39, %xmm1, %xmm1 + vpshufd $0x39, %xmm9, %xmm9 + vpshufd $0x39, %xmm13, %xmm13 + vpxor %xmm4, %xmm1, %xmm1 + vpxor %xmm6, %xmm9, %xmm9 + vpxor %xmm7, %xmm13, %xmm13 + + vpaddd %xmm1, %xmm0, %xmm4 + vpaddd %xmm9, %xmm8, %xmm6 + vpaddd %xmm13, %xmm12, %xmm7 + vprotd $9, %xmm4, %xmm4 + vprotd $9, %xmm6, %xmm6 + vprotd $9, %xmm7, %xmm7 + vpxor %xmm4, %xmm2, %xmm2 + vpxor %xmm6, %xmm10, %xmm10 + vpxor %xmm7, %xmm14, %xmm14 + + vpaddd %xmm2, %xmm1, %xmm4 + vpaddd %xmm10, %xmm9, %xmm6 + vpaddd %xmm14, %xmm13, %xmm7 + vprotd $13, %xmm4, %xmm4 + vprotd $13, %xmm6, %xmm6 + vprotd $13, %xmm7, %xmm7 + vpshufd $0x93, %xmm1, %xmm1 + vpshufd $0x93, %xmm9, %xmm9 + vpshufd $0x93, %xmm13, %xmm13 + vpxor %xmm4, %xmm3, %xmm3 + vpxor %xmm6, %xmm11, %xmm11 + vpxor %xmm7, %xmm15, %xmm15 + + vpaddd %xmm3, %xmm2, %xmm4 + vpaddd %xmm11, %xmm10, %xmm6 + vpaddd %xmm15, %xmm14, %xmm7 + vprotd $18, %xmm4, %xmm4 + vprotd $18, %xmm6, %xmm6 + vprotd $18, %xmm7, %xmm7 + vpshufd $0x4e, %xmm2, %xmm2 + vpshufd $0x4e, %xmm10, %xmm10 + vpshufd $0x4e, %xmm14, %xmm14 + vpxor %xmm4, %xmm0, %xmm0 + vpxor %xmm6, %xmm8, %xmm8 + vpxor %xmm7, %xmm12, %xmm12 + vpshufd $0x39, %xmm3, %xmm3 + vpshufd $0x39, %xmm11, %xmm11 + vpshufd $0x39, %xmm15, %xmm15 +.endm + +.macro salsa8_core_3way_xop + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround + salsa8_core_3way_xop_doubleround +.endm + + .p2align 6 +scrypt_core_3way_xop: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq 3*131072(%rsi), %rax +scrypt_core_3way_xop_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_xop + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 
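+
+	/* The paddd chain around this point is the Salsa20/8 feed-forward:
+	 * salsa8_core_3way_xop computes only the core permutation, and the
+	 * input words saved to the scratchpad just above are added back in
+	 * afterwards, i.e. out[i] = core(in)[i] + in[i].
+	 */
+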
+ paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xop + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xop_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq $1024, %rcx +scrypt_core_3way_xop_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl $1023, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl $1023, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl $1023, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_xop + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 
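+
+	/* A sketch of the scratchpad indexing done by the movd/andl/leaq/
+	 * shll groups earlier in this loop.  Each lane takes scrypt's
+	 * integerify value from the first word of its second 64-byte block
+	 * and turns it into a byte offset into the interleaved 3-way
+	 * scratchpad; roughly, in C (illustrative names, not from this
+	 * file):
+	 *
+	 *   j   = X[16] & 1023;            // integerify(X) mod N
+	 *   off = (3 * j + lane) * 128;    // one 384-byte slot per j
+	 *
+	 * leaq lane(%r,%r,2) forms 3*j + lane and shll $7 scales by 128.
+	 */
+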
+ movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xop + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xop_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + scrypt_core_3way_cleanup + ret +#endif /* USE_XOP */ +#endif /* USE_AVX */ + +.macro salsa8_core_3way_xmm_doubleround + movdqa %xmm1, %xmm4 + movdqa %xmm9, %xmm6 + movdqa %xmm13, %xmm7 + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + pxor %xmm5, %xmm3 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm11 + pxor %xmm5, %xmm11 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm15 + pxor %xmm5, %xmm15 + movdqa %xmm12, %xmm7 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pshufd $0x93, %xmm3, %xmm3 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm11, %xmm6 + pshufd $0x93, %xmm11, %xmm11 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm14 + pshufd $0x93, %xmm15, %xmm15 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, 
%xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm1 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm9 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm9 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm13 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm13 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + pxor %xmm5, %xmm0 + movdqa %xmm3, %xmm4 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm9, %xmm9 + pxor %xmm5, %xmm8 + movdqa %xmm11, %xmm6 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + movdqa %xmm15, %xmm7 + pxor %xmm5, %xmm12 + pshufd $0x39, %xmm13, %xmm13 + + paddd %xmm0, %xmm4 + paddd %xmm8, %xmm6 + paddd %xmm12, %xmm7 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + pxor %xmm5, %xmm1 + movdqa %xmm0, %xmm4 + movdqa %xmm6, %xmm5 + pslld $7, %xmm6 + psrld $25, %xmm5 + pxor %xmm6, %xmm9 + pxor %xmm5, %xmm9 + movdqa %xmm8, %xmm6 + movdqa %xmm7, %xmm5 + pslld $7, %xmm7 + psrld $25, %xmm5 + pxor %xmm7, %xmm13 + pxor %xmm5, %xmm13 + movdqa %xmm12, %xmm7 + + paddd %xmm1, %xmm4 + paddd %xmm9, %xmm6 + paddd %xmm13, %xmm7 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pshufd $0x93, %xmm1, %xmm1 + pxor %xmm5, %xmm2 + movdqa %xmm6, %xmm5 + pslld $9, %xmm6 + psrld $23, %xmm5 + pxor %xmm6, %xmm10 + movdqa %xmm9, %xmm6 + pshufd $0x93, %xmm9, %xmm9 + pxor %xmm5, %xmm10 + movdqa %xmm7, %xmm5 + pslld $9, %xmm7 + psrld $23, %xmm5 + pxor %xmm7, %xmm14 + movdqa %xmm13, %xmm7 + pshufd $0x93, %xmm13, %xmm13 + pxor %xmm5, %xmm14 + + paddd %xmm2, %xmm4 + paddd %xmm10, %xmm6 + paddd %xmm14, %xmm7 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pshufd $0x4e, %xmm2, %xmm2 + pxor %xmm5, %xmm3 + movdqa %xmm6, %xmm5 + pslld $13, %xmm6 + psrld $19, %xmm5 + pxor %xmm6, %xmm11 + movdqa %xmm10, %xmm6 + pshufd $0x4e, %xmm10, %xmm10 + pxor %xmm5, %xmm11 + movdqa %xmm7, %xmm5 + pslld $13, %xmm7 + psrld $19, %xmm5 + pxor %xmm7, %xmm15 + movdqa %xmm14, %xmm7 + pshufd $0x4e, %xmm14, %xmm14 + pxor %xmm5, %xmm15 + + paddd %xmm3, %xmm4 + paddd %xmm11, %xmm6 + paddd %xmm15, %xmm7 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 + movdqa %xmm6, %xmm5 + pslld $18, %xmm6 + psrld $14, %xmm5 + pxor %xmm6, %xmm8 + pshufd $0x39, %xmm11, %xmm11 + pxor %xmm5, %xmm8 + movdqa %xmm7, %xmm5 + pslld $18, %xmm7 + psrld $14, %xmm5 + pxor %xmm7, %xmm12 + pshufd $0x39, %xmm15, %xmm15 + pxor %xmm5, %xmm12 +.endm + +.macro salsa8_core_3way_xmm + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround + salsa8_core_3way_xmm_doubleround +.endm + + .p2align 6 +scrypt_core_3way_xmm: + scrypt_shuffle %rdi, 0, %rsp, 0 + scrypt_shuffle %rdi, 64, %rsp, 64 + scrypt_shuffle %rdi, 128, %rsp, 128 + scrypt_shuffle %rdi, 192, %rsp, 192 + scrypt_shuffle %rdi, 256, %rsp, 256 + scrypt_shuffle %rdi, 320, %rsp, 320 + + movdqa 64(%rsp), %xmm0 + movdqa 80(%rsp), %xmm1 + movdqa 96(%rsp), %xmm2 + movdqa 112(%rsp), %xmm3 + movdqa 128+64(%rsp), %xmm8 + movdqa 128+80(%rsp), %xmm9 + movdqa 128+96(%rsp), %xmm10 + 
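+
+	/* salsa8_core_3way_xmm_doubleround above emulates the 32-bit
+	 * rotates of Salsa20 with shift pairs, since SSE2 has no vector
+	 * rotate: rotl(x, n) == (x << n) | (x >> (32 - n)), realized as a
+	 * pslld/psrld pair feeding two pxor.  Register assignment in this
+	 * routine: xmm0-3 hold the second 64-byte block of lane 0, xmm8-11
+	 * lane 1, and xmm12-15 lane 2.
+	 */
+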
movdqa 128+112(%rsp), %xmm11 + movdqa 256+64(%rsp), %xmm12 + movdqa 256+80(%rsp), %xmm13 + movdqa 256+96(%rsp), %xmm14 + movdqa 256+112(%rsp), %xmm15 + + movq %rsi, %rbx + leaq 3*131072(%rsi), %rax +scrypt_core_3way_xmm_loop1: + movdqa %xmm0, 64(%rbx) + movdqa %xmm1, 80(%rbx) + movdqa %xmm2, 96(%rbx) + movdqa %xmm3, 112(%rbx) + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + movdqa %xmm8, 128+64(%rbx) + movdqa %xmm9, 128+80(%rbx) + movdqa %xmm10, 128+96(%rbx) + movdqa %xmm11, 128+112(%rbx) + pxor 128+0(%rsp), %xmm8 + pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + movdqa %xmm12, 256+64(%rbx) + movdqa %xmm13, 256+80(%rbx) + movdqa %xmm14, 256+96(%rbx) + movdqa %xmm15, 256+112(%rbx) + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rbx) + movdqa %xmm1, 16(%rbx) + movdqa %xmm2, 32(%rbx) + movdqa %xmm3, 48(%rbx) + movdqa %xmm8, 128+0(%rbx) + movdqa %xmm9, 128+16(%rbx) + movdqa %xmm10, 128+32(%rbx) + movdqa %xmm11, 128+48(%rbx) + movdqa %xmm12, 256+0(%rbx) + movdqa %xmm13, 256+16(%rbx) + movdqa %xmm14, 256+32(%rbx) + movdqa %xmm15, 256+48(%rbx) + + salsa8_core_3way_xmm + paddd 0(%rbx), %xmm0 + paddd 16(%rbx), %xmm1 + paddd 32(%rbx), %xmm2 + paddd 48(%rbx), %xmm3 + paddd 128+0(%rbx), %xmm8 + paddd 128+16(%rbx), %xmm9 + paddd 128+32(%rbx), %xmm10 + paddd 128+48(%rbx), %xmm11 + paddd 256+0(%rbx), %xmm12 + paddd 256+16(%rbx), %xmm13 + paddd 256+32(%rbx), %xmm14 + paddd 256+48(%rbx), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rbx), %xmm0 + pxor 80(%rbx), %xmm1 + pxor 96(%rbx), %xmm2 + pxor 112(%rbx), %xmm3 + pxor 128+64(%rbx), %xmm8 + pxor 128+80(%rbx), %xmm9 + pxor 128+96(%rbx), %xmm10 + pxor 128+112(%rbx), %xmm11 + pxor 256+64(%rbx), %xmm12 + pxor 256+80(%rbx), %xmm13 + pxor 256+96(%rbx), %xmm14 + pxor 256+112(%rbx), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xmm + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + + addq $3*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_3way_xmm_loop1 + + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + movq $1024, %rcx +scrypt_core_3way_xmm_loop2: + movd %xmm0, %ebp + movd %xmm8, %ebx + movd %xmm12, %eax + pxor 0(%rsp), %xmm0 + pxor 16(%rsp), %xmm1 + pxor 32(%rsp), %xmm2 + pxor 48(%rsp), %xmm3 + pxor 128+0(%rsp), %xmm8 + 
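+
+	/* Rough C sketch of this second phase, per lane (the three lanes
+	 * run in parallel across the xmm register banks; names below are
+	 * illustrative, not from this file):
+	 *
+	 *   for (i = 0; i < 1024; i++) {
+	 *       j = X[16] & 1023;           // integerify
+	 *       xor_block(X, &V[32 * j]);   // X ^= V[j]
+	 *       blockmix_salsa8(X);
+	 *   }
+	 *
+	 * The movd instructions above pull out each lane's index word
+	 * before the pxor chain overwrites the registers.
+	 */
+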
pxor 128+16(%rsp), %xmm9 + pxor 128+32(%rsp), %xmm10 + pxor 128+48(%rsp), %xmm11 + pxor 256+0(%rsp), %xmm12 + pxor 256+16(%rsp), %xmm13 + pxor 256+32(%rsp), %xmm14 + pxor 256+48(%rsp), %xmm15 + andl $1023, %ebp + leaq (%rbp, %rbp, 2), %rbp + shll $7, %ebp + andl $1023, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $7, %ebx + andl $1023, %eax + leaq 2(%rax, %rax, 2), %rax + shll $7, %eax + pxor 0(%rsi, %rbp), %xmm0 + pxor 16(%rsi, %rbp), %xmm1 + pxor 32(%rsi, %rbp), %xmm2 + pxor 48(%rsi, %rbp), %xmm3 + pxor 0(%rsi, %rbx), %xmm8 + pxor 16(%rsi, %rbx), %xmm9 + pxor 32(%rsi, %rbx), %xmm10 + pxor 48(%rsi, %rbx), %xmm11 + pxor 0(%rsi, %rax), %xmm12 + pxor 16(%rsi, %rax), %xmm13 + pxor 32(%rsi, %rax), %xmm14 + pxor 48(%rsi, %rax), %xmm15 + + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + salsa8_core_3way_xmm + paddd 0(%rsp), %xmm0 + paddd 16(%rsp), %xmm1 + paddd 32(%rsp), %xmm2 + paddd 48(%rsp), %xmm3 + paddd 128+0(%rsp), %xmm8 + paddd 128+16(%rsp), %xmm9 + paddd 128+32(%rsp), %xmm10 + paddd 128+48(%rsp), %xmm11 + paddd 256+0(%rsp), %xmm12 + paddd 256+16(%rsp), %xmm13 + paddd 256+32(%rsp), %xmm14 + paddd 256+48(%rsp), %xmm15 + movdqa %xmm0, 0(%rsp) + movdqa %xmm1, 16(%rsp) + movdqa %xmm2, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm8, 128+0(%rsp) + movdqa %xmm9, 128+16(%rsp) + movdqa %xmm10, 128+32(%rsp) + movdqa %xmm11, 128+48(%rsp) + movdqa %xmm12, 256+0(%rsp) + movdqa %xmm13, 256+16(%rsp) + movdqa %xmm14, 256+32(%rsp) + movdqa %xmm15, 256+48(%rsp) + + pxor 64(%rsi, %rbp), %xmm0 + pxor 80(%rsi, %rbp), %xmm1 + pxor 96(%rsi, %rbp), %xmm2 + pxor 112(%rsi, %rbp), %xmm3 + pxor 64(%rsi, %rbx), %xmm8 + pxor 80(%rsi, %rbx), %xmm9 + pxor 96(%rsi, %rbx), %xmm10 + pxor 112(%rsi, %rbx), %xmm11 + pxor 64(%rsi, %rax), %xmm12 + pxor 80(%rsi, %rax), %xmm13 + pxor 96(%rsi, %rax), %xmm14 + pxor 112(%rsi, %rax), %xmm15 + pxor 64(%rsp), %xmm0 + pxor 80(%rsp), %xmm1 + pxor 96(%rsp), %xmm2 + pxor 112(%rsp), %xmm3 + pxor 128+64(%rsp), %xmm8 + pxor 128+80(%rsp), %xmm9 + pxor 128+96(%rsp), %xmm10 + pxor 128+112(%rsp), %xmm11 + pxor 256+64(%rsp), %xmm12 + pxor 256+80(%rsp), %xmm13 + pxor 256+96(%rsp), %xmm14 + pxor 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + salsa8_core_3way_xmm + paddd 64(%rsp), %xmm0 + paddd 80(%rsp), %xmm1 + paddd 96(%rsp), %xmm2 + paddd 112(%rsp), %xmm3 + paddd 128+64(%rsp), %xmm8 + paddd 128+80(%rsp), %xmm9 + paddd 128+96(%rsp), %xmm10 + paddd 128+112(%rsp), %xmm11 + paddd 256+64(%rsp), %xmm12 + paddd 256+80(%rsp), %xmm13 + paddd 256+96(%rsp), %xmm14 + paddd 256+112(%rsp), %xmm15 + movdqa %xmm0, 64(%rsp) + movdqa %xmm1, 80(%rsp) + movdqa %xmm2, 96(%rsp) + movdqa %xmm3, 112(%rsp) + movdqa %xmm8, 128+64(%rsp) + movdqa %xmm9, 128+80(%rsp) + movdqa %xmm10, 128+96(%rsp) + movdqa %xmm11, 128+112(%rsp) + movdqa %xmm12, 256+64(%rsp) + movdqa %xmm13, 256+80(%rsp) + movdqa %xmm14, 256+96(%rsp) + movdqa %xmm15, 256+112(%rsp) + + subq $1, %rcx + ja scrypt_core_3way_xmm_loop2 + + scrypt_shuffle %rsp, 0, %rdi, 0 + 
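+
+	/* These scrypt_shuffle calls undo the word permutation applied on
+	 * entry: the permutation swaps state words in pairs (1<->5, 2<->10,
+	 * 3<->15, 4<->12, 7<->11, 9<->13; words 0, 6, 8 and 14 stay put) so
+	 * that each xmm register holds one diagonal of the Salsa20 matrix.
+	 * Being an involution, the same macro converts in both directions.
+	 */
+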
scrypt_shuffle %rsp, 64, %rdi, 64 + scrypt_shuffle %rsp, 128, %rdi, 128 + scrypt_shuffle %rsp, 192, %rdi, 192 + scrypt_shuffle %rsp, 256, %rdi, 256 + scrypt_shuffle %rsp, 320, %rdi, 320 + + scrypt_core_3way_cleanup + ret + + +#if defined(USE_AVX2) + +.macro salsa8_core_6way_avx2_doubleround + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, 
%ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 +.endm + +.macro salsa8_core_6way_avx2 + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround +.endm + + .text + .p2align 6 + .globl scrypt_core_6way + .globl _scrypt_core_6way +scrypt_core_6way: +_scrypt_core_6way: + pushq %rbx + pushq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + subq $176, %rsp + vmovdqa %xmm6, 8(%rsp) + vmovdqa %xmm7, 24(%rsp) + vmovdqa %xmm8, 40(%rsp) + vmovdqa %xmm9, 56(%rsp) + vmovdqa %xmm10, 72(%rsp) + vmovdqa %xmm11, 88(%rsp) + vmovdqa %xmm12, 104(%rsp) + vmovdqa %xmm13, 120(%rsp) + vmovdqa %xmm14, 136(%rsp) + vmovdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + movq %rsp, %rdx + subq $768, %rsp + andq $-128, %rsp + +.macro scrypt_core_6way_cleanup + movq %rdx, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + popq %rdi + vmovdqa 8(%rsp), %xmm6 + vmovdqa 24(%rsp), %xmm7 + vmovdqa 40(%rsp), %xmm8 + vmovdqa 56(%rsp), %xmm9 + vmovdqa 72(%rsp), %xmm10 + vmovdqa 88(%rsp), %xmm11 + vmovdqa 104(%rsp), %xmm12 + vmovdqa 120(%rsp), %xmm13 + vmovdqa 136(%rsp), %xmm14 + vmovdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx +.endm + +.macro scrypt_shuffle_pack2 src, so, dest, do + vmovdqa \so+0*16(\src), %xmm0 + vmovdqa \so+1*16(\src), %xmm1 + vmovdqa \so+2*16(\src), %xmm2 + vmovdqa \so+3*16(\src), %xmm3 + vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 + vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 + vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 + vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, \do+0*32(\dest) + vmovdqa %ymm1, \do+1*32(\dest) + vmovdqa %ymm2, \do+2*32(\dest) + vmovdqa %ymm3, \do+3*32(\dest) +.endm + +.macro scrypt_shuffle_unpack2 src, so, dest, do + vmovdqa \so+0*32(\src), %ymm0 + vmovdqa \so+1*32(\src), %ymm1 + vmovdqa \so+2*32(\src), %ymm2 + vmovdqa \so+3*32(\src), %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, \do+0*16(\dest) + vmovdqa %xmm1, \do+1*16(\dest) + vmovdqa %xmm2, \do+2*16(\dest) + vmovdqa %xmm3, \do+3*16(\dest) + vextracti128 $1, %ymm0, \do+128+0*16(\dest) + vextracti128 $1, %ymm1, \do+128+1*16(\dest) + vextracti128 $1, %ymm2, \do+128+2*16(\dest) + vextracti128 $1, %ymm3, 
\do+128+3*16(\dest) +.endm + +scrypt_core_6way_avx2: + scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 + scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128 + scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 + scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 + scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 + scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 + + vmovdqa 0*256+4*32(%rsp), %ymm0 + vmovdqa 0*256+5*32(%rsp), %ymm1 + vmovdqa 0*256+6*32(%rsp), %ymm2 + vmovdqa 0*256+7*32(%rsp), %ymm3 + vmovdqa 1*256+4*32(%rsp), %ymm8 + vmovdqa 1*256+5*32(%rsp), %ymm9 + vmovdqa 1*256+6*32(%rsp), %ymm10 + vmovdqa 1*256+7*32(%rsp), %ymm11 + vmovdqa 2*256+4*32(%rsp), %ymm12 + vmovdqa 2*256+5*32(%rsp), %ymm13 + vmovdqa 2*256+6*32(%rsp), %ymm14 + vmovdqa 2*256+7*32(%rsp), %ymm15 + + movq %rsi, %rbx + leaq 6*131072(%rsi), %rax +scrypt_core_6way_avx2_loop1: + vmovdqa %ymm0, 0*256+4*32(%rbx) + vmovdqa %ymm1, 0*256+5*32(%rbx) + vmovdqa %ymm2, 0*256+6*32(%rbx) + vmovdqa %ymm3, 0*256+7*32(%rbx) + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vmovdqa %ymm8, 1*256+4*32(%rbx) + vmovdqa %ymm9, 1*256+5*32(%rbx) + vmovdqa %ymm10, 1*256+6*32(%rbx) + vmovdqa %ymm11, 1*256+7*32(%rbx) + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vmovdqa %ymm12, 2*256+4*32(%rbx) + vmovdqa %ymm13, 2*256+5*32(%rbx) + vmovdqa %ymm14, 2*256+6*32(%rbx) + vmovdqa %ymm15, 2*256+7*32(%rbx) + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rbx) + vmovdqa %ymm1, 0*256+1*32(%rbx) + vmovdqa %ymm2, 0*256+2*32(%rbx) + vmovdqa %ymm3, 0*256+3*32(%rbx) + vmovdqa %ymm8, 1*256+0*32(%rbx) + vmovdqa %ymm9, 1*256+1*32(%rbx) + vmovdqa %ymm10, 1*256+2*32(%rbx) + vmovdqa %ymm11, 1*256+3*32(%rbx) + vmovdqa %ymm12, 2*256+0*32(%rbx) + vmovdqa %ymm13, 2*256+1*32(%rbx) + vmovdqa %ymm14, 2*256+2*32(%rbx) + vmovdqa %ymm15, 2*256+3*32(%rbx) + + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 + vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 + vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 + vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 + vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 + vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 + vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 + vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 + vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 + vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 + vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 + vpxor 
2*256+7*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + + addq $6*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_6way_avx2_loop1 + + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + movq $1024, %rcx +scrypt_core_6way_avx2_loop2: + vmovd %xmm0, %ebp + vmovd %xmm8, %ebx + vmovd %xmm12, %eax + vextracti128 $1, %ymm0, %xmm4 + vextracti128 $1, %ymm8, %xmm5 + vextracti128 $1, %ymm12, %xmm6 + vmovd %xmm4, %r8d + vmovd %xmm5, %r9d + vmovd %xmm6, %r10d + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + andl $1023, %ebp + leaq 0(%rbp, %rbp, 2), %rbp + shll $8, %ebp + andl $1023, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $8, %ebx + andl $1023, %eax + leaq 2(%rax, %rax, 2), %rax + shll $8, %eax + andl $1023, %r8d + leaq 0(%r8, %r8, 2), %r8 + shll $8, %r8d + andl $1023, %r9d + leaq 1(%r9, %r9, 2), %r9 + shll $8, %r9d + andl $1023, %r10d + leaq 2(%r10, %r10, 2), %r10 + shll $8, %r10d + vmovdqa 0*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 0*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 0*32(%rsi, %rax), %xmm4 + vinserti128 $1, 
0*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rax), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rax), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rax), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vmovdqa 4*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 4*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 4*32(%rsi, %rax), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rax), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rax), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rax), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 + vpxor 
2*256+6*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + subq $1, %rcx + ja scrypt_core_6way_avx2_loop2 + + scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 + scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 + scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 + scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 + scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 + scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 + + scrypt_core_6way_cleanup + ret + +#endif /* USE_AVX2 */ + +#endif diff --git a/scrypt-x86.S b/scrypt-x86.S new file mode 100644 index 00000000..4fb2c466 --- /dev/null +++ b/scrypt-x86.S @@ -0,0 +1,821 @@ +/* + * Copyright 2011-2012 pooler@litecoinpool.org + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__i386__) + +.macro scrypt_shuffle src, so, dest, do + movl \so+60(\src), %eax + movl \so+44(\src), %ebx + movl \so+28(\src), %ecx + movl \so+12(\src), %edx + movl %eax, \do+12(\dest) + movl %ebx, \do+28(\dest) + movl %ecx, \do+44(\dest) + movl %edx, \do+60(\dest) + movl \so+40(\src), %eax + movl \so+8(\src), %ebx + movl \so+48(\src), %ecx + movl \so+16(\src), %edx + movl %eax, \do+8(\dest) + movl %ebx, \do+40(\dest) + movl %ecx, \do+16(\dest) + movl %edx, \do+48(\dest) + movl \so+20(\src), %eax + movl \so+4(\src), %ebx + movl \so+52(\src), %ecx + movl \so+36(\src), %edx + movl %eax, \do+4(\dest) + movl %ebx, \do+20(\dest) + movl %ecx, \do+36(\dest) + movl %edx, \do+52(\dest) + movl \so+0(\src), %eax + movl \so+24(\src), %ebx + movl \so+32(\src), %ecx + movl \so+56(\src), %edx + movl %eax, \do+0(\dest) + movl %ebx, \do+24(\dest) + movl %ecx, \do+32(\dest) + movl %edx, \do+56(\dest) +.endm + +.macro salsa8_core_gen_quadround + movl 52(%esp), %ecx + movl 4(%esp), %edx + movl 20(%esp), %ebx + movl 8(%esp), %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 4(%esp) + movl 36(%esp), %edi + leal (%edx, %ebx), %ebp + roll $9, %ebp + xorl %ebp, %edi + movl 24(%esp), %ebp + movl %edi, 8(%esp) + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 40(%esp), %ebx + movl %ecx, 20(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 24(%esp) + movl 56(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 36(%esp) + movl 28(%esp), %ecx + movl %edx, 28(%esp) + movl 44(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 60(%esp), %ebx + movl %esi, 40(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 44(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 48(%esp), %esi + movl %ebp, 48(%esp) + movl 64(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl %ebx, %ecx + movl %edx, 52(%esp) + movl 28(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 40(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 40(%esp) + movl 12(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 12(%esp) + movl 4(%esp), %esi + movl %ebp, 4(%esp) + movl 48(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 48(%esp) + movl 32(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 32(%esp) + movl 24(%esp), %ecx + movl %edx, 24(%esp) + movl 52(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 28(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + 
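+
+	/* Each leal/roll/xorl (or addl/roll/xorl) triplet in this macro is
+	 * one Salsa20 quarter-round step, x[d] ^= ROTL32(x[a] + x[b], k),
+	 * with k cycling through 7, 9, 13 and 18.  The 16 state words are
+	 * kept in the stack frame (4(%esp) through 64(%esp) as seen from
+	 * inside salsa8_core_gen), ia32 having too few general registers
+	 * to hold them all.
+	 */
+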
movl %ebx, 52(%esp) + movl 8(%esp), %edi + xorl %esi, %ebp + leal (%edx, %ebx), %esi + roll $9, %esi + xorl %esi, %edi + movl %edi, 8(%esp) + movl 44(%esp), %esi + movl %ebp, 44(%esp) + movl 4(%esp), %ebp + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 20(%esp), %ebx + movl %ecx, 4(%esp) + addl %edi, %ecx + roll $18, %ecx + leal (%esi, %ebp), %edi + roll $7, %edi + xorl %edi, %ebx + movl 36(%esp), %edi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %edi + movl %edi, 20(%esp) + movl %ebx, %ecx + movl %edx, 36(%esp) + movl 24(%esp), %edx + addl %edi, %ebx + roll $13, %ebx + xorl %ebx, %esi + movl 28(%esp), %ebx + movl %esi, 24(%esp) + addl %edi, %esi + roll $18, %esi + leal (%ecx, %edx), %edi + roll $7, %edi + xorl %edi, %ebx + movl %ebx, 28(%esp) + xorl %esi, %ebp + movl 8(%esp), %esi + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl 40(%esp), %edi + movl %ebp, 8(%esp) + movl 44(%esp), %ebp + movl %esi, 40(%esp) + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 4(%esp), %ebx + movl %ecx, 44(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 4(%esp) + movl 20(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + movl 48(%esp), %ecx + movl %edx, 20(%esp) + movl 36(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 24(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 60(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 52(%esp), %edi + movl %ebp, 36(%esp) + movl 8(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl %ebx, %ecx + movl %edx, 48(%esp) + movl 20(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 24(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 8(%esp) + movl 12(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 12(%esp) + movl 28(%esp), %edi + movl %ebp, 52(%esp) + movl 36(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 16(%esp), %ebx + movl %ecx, 16(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 28(%esp) + movl 32(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 32(%esp) + movl 4(%esp), %ecx + movl %edx, 4(%esp) + movl 48(%esp), %edx + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %edi + movl 20(%esp), %ebx + movl %edi, 20(%esp) + addl %esi, %edi + roll $18, %edi + leal (%ecx, %edx), %esi + roll $7, %esi + xorl %esi, %ebx + movl %ebx, 48(%esp) + movl 40(%esp), %esi + xorl %edi, %ebp + leal (%edx, %ebx), %edi + roll $9, %edi + xorl %edi, %esi + movl %esi, 36(%esp) + movl 60(%esp), %edi + movl %ebp, 24(%esp) + movl 52(%esp), %ebp + addl %esi, %ebx + roll $13, %ebx + xorl %ebx, %ecx + movl 44(%esp), %ebx + movl %ecx, 40(%esp) + addl %esi, %ecx + roll $18, %ecx + leal (%edi, %ebp), %esi + roll $7, %esi + xorl %esi, %ebx + 
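+
+	/* One expansion of this quadround macro is four Salsa20 rounds
+	 * (two double-rounds); salsa8_core_gen below expands it twice for
+	 * the full eight rounds of Salsa20/8.
+	 */
+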
movl %ebx, 52(%esp) + movl 56(%esp), %esi + xorl %ecx, %edx + leal (%ebp, %ebx), %ecx + roll $9, %ecx + xorl %ecx, %esi + movl %esi, 56(%esp) + addl %esi, %ebx + movl %edx, 44(%esp) + roll $13, %ebx + xorl %ebx, %edi + movl %edi, 60(%esp) + addl %esi, %edi + roll $18, %edi + xorl %edi, %ebp + movl %ebp, 64(%esp) +.endm + + .text + .p2align 5 +salsa8_core_gen: + salsa8_core_gen_quadround + salsa8_core_gen_quadround + ret + + + .text + .p2align 5 + .globl scrypt_core + .globl _scrypt_core +scrypt_core: +_scrypt_core: + pushl %ebx + pushl %ebp + pushl %edi + pushl %esi + + /* Check for SSE2 availability */ + movl $1, %eax + cpuid + andl $0x04000000, %edx + jnz scrypt_core_sse2 + +scrypt_core_gen: + movl 20(%esp), %edi + movl 24(%esp), %esi + subl $72, %esp + +.macro scrypt_core_macro1a p, q + movl \p(%edi), %eax + movl \q(%edi), %edx + movl %eax, \p(%esi) + movl %edx, \q(%esi) + xorl %edx, %eax + movl %eax, \p(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro1b p, q + movl \p(%edi), %eax + xorl \p(%esi, %edx), %eax + movl \q(%edi), %ebx + xorl \q(%esi, %edx), %ebx + movl %ebx, \q(%edi) + xorl %ebx, %eax + movl %eax, \p(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro2 p, q + movl \p(%esp), %eax + addl \p(%edi), %eax + movl %eax, \p(%edi) + xorl \q(%edi), %eax + movl %eax, \q(%edi) + movl %eax, \p(%esp) +.endm + +.macro scrypt_core_macro3 p, q + movl \p(%esp), %eax + addl \q(%edi), %eax + movl %eax, \q(%edi) +.endm + + leal 131072(%esi), %ecx +scrypt_core_gen_loop1: + movl %esi, 64(%esp) + movl %ecx, 68(%esp) + + scrypt_core_macro1a 0, 64 + scrypt_core_macro1a 4, 68 + scrypt_core_macro1a 8, 72 + scrypt_core_macro1a 12, 76 + scrypt_core_macro1a 16, 80 + scrypt_core_macro1a 20, 84 + scrypt_core_macro1a 24, 88 + scrypt_core_macro1a 28, 92 + scrypt_core_macro1a 32, 96 + scrypt_core_macro1a 36, 100 + scrypt_core_macro1a 40, 104 + scrypt_core_macro1a 44, 108 + scrypt_core_macro1a 48, 112 + scrypt_core_macro1a 52, 116 + scrypt_core_macro1a 56, 120 + scrypt_core_macro1a 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 + + movl 64(%esp), %esi + movl 68(%esp), %ecx + addl $128, %esi + cmpl %ecx, %esi + jne scrypt_core_gen_loop1 + + movl 96(%esp), %esi + movl $1024, %ecx +scrypt_core_gen_loop2: + movl %ecx, 68(%esp) + + movl 64(%edi), %edx + andl $1023, %edx + shll $7, %edx + + scrypt_core_macro1b 0, 64 + scrypt_core_macro1b 4, 68 + scrypt_core_macro1b 8, 72 + scrypt_core_macro1b 12, 76 + scrypt_core_macro1b 16, 80 + scrypt_core_macro1b 20, 84 + scrypt_core_macro1b 24, 88 + scrypt_core_macro1b 28, 92 + scrypt_core_macro1b 32, 96 + 
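+
+	/* scrypt_core_macro1b folds the scratchpad lookup into the load:
+	 * each invocation XORs one pair of X words with the corresponding
+	 * words of V[j], where %edx = (X[16] & 1023) * 128 was computed at
+	 * the top of this loop (the generic scratchpad is laid out flat,
+	 * one 128-byte block per j).
+	 */
+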
scrypt_core_macro1b 36, 100 + scrypt_core_macro1b 40, 104 + scrypt_core_macro1b 44, 108 + scrypt_core_macro1b 48, 112 + scrypt_core_macro1b 52, 116 + scrypt_core_macro1b 56, 120 + scrypt_core_macro1b 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + scrypt_core_macro2 0, 64 + scrypt_core_macro2 4, 68 + scrypt_core_macro2 8, 72 + scrypt_core_macro2 12, 76 + scrypt_core_macro2 16, 80 + scrypt_core_macro2 20, 84 + scrypt_core_macro2 24, 88 + scrypt_core_macro2 28, 92 + scrypt_core_macro2 32, 96 + scrypt_core_macro2 36, 100 + scrypt_core_macro2 40, 104 + scrypt_core_macro2 44, 108 + scrypt_core_macro2 48, 112 + scrypt_core_macro2 52, 116 + scrypt_core_macro2 56, 120 + scrypt_core_macro2 60, 124 + + call salsa8_core_gen + + movl 92(%esp), %edi + movl 96(%esp), %esi + scrypt_core_macro3 0, 64 + scrypt_core_macro3 4, 68 + scrypt_core_macro3 8, 72 + scrypt_core_macro3 12, 76 + scrypt_core_macro3 16, 80 + scrypt_core_macro3 20, 84 + scrypt_core_macro3 24, 88 + scrypt_core_macro3 28, 92 + scrypt_core_macro3 32, 96 + scrypt_core_macro3 36, 100 + scrypt_core_macro3 40, 104 + scrypt_core_macro3 44, 108 + scrypt_core_macro3 48, 112 + scrypt_core_macro3 52, 116 + scrypt_core_macro3 56, 120 + scrypt_core_macro3 60, 124 + + movl 68(%esp), %ecx + subl $1, %ecx + ja scrypt_core_gen_loop2 + + addl $72, %esp + popl %esi + popl %edi + popl %ebp + popl %ebx + ret + + +.macro salsa8_core_sse2_doubleround + movdqa %xmm1, %xmm4 + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm3 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm3, %xmm3 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm1 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + movdqa %xmm3, %xmm4 + pxor %xmm5, %xmm0 + pshufd $0x39, %xmm1, %xmm1 + + paddd %xmm0, %xmm4 + movdqa %xmm4, %xmm5 + pslld $7, %xmm4 + psrld $25, %xmm5 + pxor %xmm4, %xmm1 + movdqa %xmm0, %xmm4 + pxor %xmm5, %xmm1 + + paddd %xmm1, %xmm4 + movdqa %xmm4, %xmm5 + pslld $9, %xmm4 + psrld $23, %xmm5 + pxor %xmm4, %xmm2 + movdqa %xmm1, %xmm4 + pxor %xmm5, %xmm2 + pshufd $0x93, %xmm1, %xmm1 + + paddd %xmm2, %xmm4 + movdqa %xmm4, %xmm5 + pslld $13, %xmm4 + psrld $19, %xmm5 + pxor %xmm4, %xmm3 + movdqa %xmm2, %xmm4 + pxor %xmm5, %xmm3 + pshufd $0x4e, %xmm2, %xmm2 + + paddd %xmm3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $18, %xmm4 + psrld $14, %xmm5 + pxor %xmm4, %xmm0 + pshufd $0x39, %xmm3, %xmm3 + pxor %xmm5, %xmm0 +.endm + +.macro salsa8_core_sse2 + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround + salsa8_core_sse2_doubleround +.endm + + .p2align 5 +scrypt_core_sse2: + movl 20(%esp), %edi + movl 24(%esp), %esi + movl %esp, %ebp + subl $128, %esp + andl $-16, %esp + + scrypt_shuffle %edi, 0, %esp, 0 + scrypt_shuffle %edi, 64, %esp, 64 + + movdqa 96(%esp), %xmm6 + movdqa 112(%esp), %xmm7 + + movl %esi, %edx + leal 131072(%esi), %ecx +scrypt_core_sse2_loop1: + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%edx) + movdqa %xmm1, 16(%edx) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%edx) + movdqa %xmm3, 
48(%edx) + movdqa %xmm4, 64(%edx) + movdqa %xmm5, 80(%edx) + movdqa %xmm6, 96(%edx) + movdqa %xmm7, 112(%edx) + + salsa8_core_sse2 + paddd 0(%edx), %xmm0 + paddd 16(%edx), %xmm1 + paddd 32(%edx), %xmm2 + paddd 48(%edx), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + salsa8_core_sse2 + paddd 64(%esp), %xmm0 + paddd 80(%esp), %xmm1 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + + addl $128, %edx + cmpl %ecx, %edx + jne scrypt_core_sse2_loop1 + + movdqa 64(%esp), %xmm4 + movdqa 80(%esp), %xmm5 + + movl $1024, %ecx +scrypt_core_sse2_loop2: + movd %xmm4, %edx + movdqa 0(%esp), %xmm0 + movdqa 16(%esp), %xmm1 + movdqa 32(%esp), %xmm2 + movdqa 48(%esp), %xmm3 + andl $1023, %edx + shll $7, %edx + pxor 0(%esi, %edx), %xmm0 + pxor 16(%esi, %edx), %xmm1 + pxor 32(%esi, %edx), %xmm2 + pxor 48(%esi, %edx), %xmm3 + + pxor %xmm4, %xmm0 + pxor %xmm5, %xmm1 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + salsa8_core_sse2 + paddd 0(%esp), %xmm0 + paddd 16(%esp), %xmm1 + paddd 32(%esp), %xmm2 + paddd 48(%esp), %xmm3 + movdqa %xmm0, 0(%esp) + movdqa %xmm1, 16(%esp) + movdqa %xmm2, 32(%esp) + movdqa %xmm3, 48(%esp) + + pxor 64(%esi, %edx), %xmm0 + pxor 80(%esi, %edx), %xmm1 + pxor 96(%esi, %edx), %xmm2 + pxor 112(%esi, %edx), %xmm3 + pxor 64(%esp), %xmm0 + pxor 80(%esp), %xmm1 + pxor %xmm6, %xmm2 + pxor %xmm7, %xmm3 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + movdqa %xmm2, %xmm6 + movdqa %xmm3, %xmm7 + salsa8_core_sse2 + paddd 64(%esp), %xmm0 + paddd 80(%esp), %xmm1 + paddd %xmm2, %xmm6 + paddd %xmm3, %xmm7 + movdqa %xmm0, %xmm4 + movdqa %xmm1, %xmm5 + movdqa %xmm0, 64(%esp) + movdqa %xmm1, 80(%esp) + + subl $1, %ecx + ja scrypt_core_sse2_loop2 + + movdqa %xmm6, 96(%esp) + movdqa %xmm7, 112(%esp) + + scrypt_shuffle %esp, 0, %edi, 0 + scrypt_shuffle %esp, 64, %edi, 64 + + movl %ebp, %esp + popl %esi + popl %edi + popl %ebp + popl %ebx + ret + +#endif diff --git a/scrypt.c b/scrypt.c new file mode 100644 index 00000000..f113c79f --- /dev/null +++ b/scrypt.c @@ -0,0 +1,758 @@ +/* + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+static const uint32_t keypad[12] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000280
+};
+static const uint32_t innerpad[11] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x000004a0
+};
+static const uint32_t outerpad[8] = {
+ 0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300
+};
+static const uint32_t finalblk[16] = {
+ 0x00000001, 0x80000000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x00000620
+};
+
+static inline void HMAC_SHA256_80_init(const uint32_t *key,
+ uint32_t *tstate, uint32_t *ostate)
+{
+ uint32_t ihash[8];
+ uint32_t pad[16];
+ int i;
+
+ /* tstate is assumed to contain the midstate of key */
+ memcpy(pad, key + 16, 16);
+ memcpy(pad + 4, keypad, 48);
+ sha256_transform(tstate, pad, 0);
+ memcpy(ihash, tstate, 32);
+
+ sha256_init(ostate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x5c5c5c5c;
+ for (; i < 16; i++)
+ pad[i] = 0x5c5c5c5c;
+ sha256_transform(ostate, pad, 0);
+
+ sha256_init(tstate);
+ for (i = 0; i < 8; i++)
+ pad[i] = ihash[i] ^ 0x36363636;
+ for (; i < 16; i++)
+ pad[i] = 0x36363636;
+ sha256_transform(tstate, pad, 0);
+}
+
+static inline void PBKDF2_SHA256_80_128(const uint32_t *tstate,
+ const uint32_t *ostate, const uint32_t *salt, uint32_t *output)
+{
+ uint32_t istate[8], ostate2[8];
+ uint32_t ibuf[16], obuf[16];
+ int i, j;
+
+ memcpy(istate, tstate, 32);
+ sha256_transform(istate, salt, 0);
+
+ memcpy(ibuf, salt + 16, 16);
+ memcpy(ibuf + 5, innerpad, 44);
+ memcpy(obuf + 8, outerpad, 32);
+
+ for (i = 0; i < 4; i++) {
+ memcpy(obuf, istate, 32);
+ ibuf[4] = i + 1;
+ sha256_transform(obuf, ibuf, 0);
+
+ memcpy(ostate2, ostate, 32);
+ sha256_transform(ostate2, obuf, 0);
+ for (j = 0; j < 8; j++)
+ output[8 * i + j] = swab32(ostate2[j]);
+ }
+}
+
+static inline void PBKDF2_SHA256_128_32(uint32_t *tstate, uint32_t *ostate,
+ const uint32_t *salt, uint32_t *output)
+{
+ uint32_t buf[16];
+ int i;
+
+ sha256_transform(tstate, salt, 1);
+ sha256_transform(tstate, salt + 16, 1);
+ sha256_transform(tstate, finalblk, 0);
+ memcpy(buf, tstate, 32);
+ memcpy(buf + 8, outerpad, 32);
+
+ sha256_transform(ostate, buf, 0);
+ for (i = 0; i < 8; i++)
+ output[i] = swab32(ostate[i]);
+}
+
+
+#ifdef HAVE_SHA256_4WAY
+
+static const uint32_t keypad_4way[4 * 12] = {
+ 0x80000000, 0x80000000, 0x80000000, 0x80000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+ 0x00000280, 0x00000280, 0x00000280,
0x00000280 +}; +static const uint32_t innerpad_4way[4 * 11] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x000004a0, 0x000004a0, 0x000004a0, 0x000004a0 +}; +static const uint32_t outerpad_4way[4 * 8] = { + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000300, 0x00000300, 0x00000300, 0x00000300 +}; +static const uint32_t finalblk_4way[4 * 16] __attribute__((aligned(16))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_4way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[4 * 8] __attribute__((aligned(16))); + uint32_t pad[4 * 16] __attribute__((aligned(16))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 4 * 16, 4 * 16); + memcpy(pad + 4 * 4, keypad_4way, 4 * 48); + sha256_transform_4way(tstate, pad, 0); + memcpy(ihash, tstate, 4 * 32); + + sha256_init_4way(ostate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 4 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_4way(ostate, pad, 0); + + sha256_init_4way(tstate); + for (i = 0; i < 4 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 4 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_4way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_4way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[4 * 8] __attribute__((aligned(16))); + uint32_t ostate2[4 * 8] __attribute__((aligned(16))); + uint32_t ibuf[4 * 16] __attribute__((aligned(16))); + uint32_t obuf[4 * 16] __attribute__((aligned(16))); + int i, j; + + memcpy(istate, tstate, 4 * 32); + sha256_transform_4way(istate, salt, 0); + + memcpy(ibuf, salt + 4 * 16, 4 * 16); + memcpy(ibuf + 4 * 5, innerpad_4way, 4 * 44); + memcpy(obuf + 4 * 8, outerpad_4way, 4 * 32); + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 4 * 32); + ibuf[4 * 4 + 0] = i + 1; + ibuf[4 * 4 + 1] = i + 1; + ibuf[4 * 4 + 2] = i + 1; + ibuf[4 * 4 + 3] = i + 1; + sha256_transform_4way(obuf, ibuf, 0); 
+ + memcpy(ostate2, ostate, 4 * 32); + sha256_transform_4way(ostate2, obuf, 0); + for (j = 0; j < 4 * 8; j++) + output[4 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[4 * 16] __attribute__((aligned(16))); + int i; + + sha256_transform_4way(tstate, salt, 1); + sha256_transform_4way(tstate, salt + 4 * 16, 1); + sha256_transform_4way(tstate, finalblk_4way, 0); + memcpy(buf, tstate, 4 * 32); + memcpy(buf + 4 * 8, outerpad_4way, 4 * 32); + + sha256_transform_4way(ostate, buf, 0); + for (i = 0; i < 4 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_4WAY */ + + +#ifdef HAVE_SHA256_8WAY + +static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8 * 8] __attribute__((aligned(32))); + uint32_t pad[8 * 16] __attribute__((aligned(32))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + pad[8 * 4 + i] = 0x80000000; + memset(pad + 8 * 5, 0x00, 8 * 40); + for (i = 0; i < 8; i++) + pad[8 * 15 + i] = 0x00000280; + sha256_transform_8way(tstate, pad, 0); + memcpy(ihash, tstate, 8 * 32); + + sha256_init_8way(ostate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 8 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_8way(ostate, pad, 0); + + sha256_init_8way(tstate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 8 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_8way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8 * 8] __attribute__((aligned(32))); + uint32_t ostate2[8 * 8] 
__attribute__((aligned(32))); + uint32_t ibuf[8 * 16] __attribute__((aligned(32))); + uint32_t obuf[8 * 16] __attribute__((aligned(32))); + int i, j; + + memcpy(istate, tstate, 8 * 32); + sha256_transform_8way(istate, salt, 0); + + memcpy(ibuf, salt + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + ibuf[8 * 5 + i] = 0x80000000; + memset(ibuf + 8 * 6, 0x00, 8 * 36); + for (i = 0; i < 8; i++) + ibuf[8 * 15 + i] = 0x000004a0; + + for (i = 0; i < 8; i++) + obuf[8 * 8 + i] = 0x80000000; + memset(obuf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + obuf[8 * 15 + i] = 0x00000300; + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 8 * 32); + ibuf[8 * 4 + 0] = i + 1; + ibuf[8 * 4 + 1] = i + 1; + ibuf[8 * 4 + 2] = i + 1; + ibuf[8 * 4 + 3] = i + 1; + ibuf[8 * 4 + 4] = i + 1; + ibuf[8 * 4 + 5] = i + 1; + ibuf[8 * 4 + 6] = i + 1; + ibuf[8 * 4 + 7] = i + 1; + sha256_transform_8way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 8 * 32); + sha256_transform_8way(ostate2, obuf, 0); + for (j = 0; j < 8 * 8; j++) + output[8 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[8 * 16] __attribute__((aligned(32))); + int i; + + sha256_transform_8way(tstate, salt, 1); + sha256_transform_8way(tstate, salt + 8 * 16, 1); + sha256_transform_8way(tstate, finalblk_8way, 0); + + memcpy(buf, tstate, 8 * 32); + for (i = 0; i < 8; i++) + buf[8 * 8 + i] = 0x80000000; + memset(buf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + buf[8 * 15 + i] = 0x00000300; + sha256_transform_8way(ostate, buf, 0); + + for (i = 0; i < 8 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_8WAY */ + + +#if defined(__x86_64__) + +#define SCRYPT_MAX_WAYS 12 +#define HAVE_SCRYPT_3WAY 1 +int scrypt_best_throughput(); +void scrypt_core(uint32_t *X, uint32_t *V); +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#if defined(USE_AVX2) +#undef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 24 +#define HAVE_SCRYPT_6WAY 1 +void scrypt_core_6way(uint32_t *X, uint32_t *V); +#endif + +#elif defined(__i386__) + +#define SCRYPT_MAX_WAYS 4 +#define scrypt_best_throughput() 1 +void scrypt_core(uint32_t *X, uint32_t *V); + +#elif defined(__arm__) && defined(__APCS_32__) + +void scrypt_core(uint32_t *X, uint32_t *V); +#if defined(__ARM_NEON__) +#undef HAVE_SHA256_4WAY +#define SCRYPT_MAX_WAYS 3 +#define HAVE_SCRYPT_3WAY 1 +#define scrypt_best_throughput() 3 +void scrypt_core_3way(uint32_t *X, uint32_t *V); +#endif + +#else + +static inline void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]) +{ + uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15; + int i; + + x00 = (B[ 0] ^= Bx[ 0]); + x01 = (B[ 1] ^= Bx[ 1]); + x02 = (B[ 2] ^= Bx[ 2]); + x03 = (B[ 3] ^= Bx[ 3]); + x04 = (B[ 4] ^= Bx[ 4]); + x05 = (B[ 5] ^= Bx[ 5]); + x06 = (B[ 6] ^= Bx[ 6]); + x07 = (B[ 7] ^= Bx[ 7]); + x08 = (B[ 8] ^= Bx[ 8]); + x09 = (B[ 9] ^= Bx[ 9]); + x10 = (B[10] ^= Bx[10]); + x11 = (B[11] ^= Bx[11]); + x12 = (B[12] ^= Bx[12]); + x13 = (B[13] ^= Bx[13]); + x14 = (B[14] ^= Bx[14]); + x15 = (B[15] ^= Bx[15]); + for (i = 0; i < 8; i += 2) { +#define R(a, b) (((a) << (b)) | ((a) >> (32 - (b)))) + /* Operate on columns. 
*/ + x04 ^= R(x00+x12, 7); x09 ^= R(x05+x01, 7); + x14 ^= R(x10+x06, 7); x03 ^= R(x15+x11, 7); + + x08 ^= R(x04+x00, 9); x13 ^= R(x09+x05, 9); + x02 ^= R(x14+x10, 9); x07 ^= R(x03+x15, 9); + + x12 ^= R(x08+x04,13); x01 ^= R(x13+x09,13); + x06 ^= R(x02+x14,13); x11 ^= R(x07+x03,13); + + x00 ^= R(x12+x08,18); x05 ^= R(x01+x13,18); + x10 ^= R(x06+x02,18); x15 ^= R(x11+x07,18); + + /* Operate on rows. */ + x01 ^= R(x00+x03, 7); x06 ^= R(x05+x04, 7); + x11 ^= R(x10+x09, 7); x12 ^= R(x15+x14, 7); + + x02 ^= R(x01+x00, 9); x07 ^= R(x06+x05, 9); + x08 ^= R(x11+x10, 9); x13 ^= R(x12+x15, 9); + + x03 ^= R(x02+x01,13); x04 ^= R(x07+x06,13); + x09 ^= R(x08+x11,13); x14 ^= R(x13+x12,13); + + x00 ^= R(x03+x02,18); x05 ^= R(x04+x07,18); + x10 ^= R(x09+x08,18); x15 ^= R(x14+x13,18); +#undef R + } + B[ 0] += x00; + B[ 1] += x01; + B[ 2] += x02; + B[ 3] += x03; + B[ 4] += x04; + B[ 5] += x05; + B[ 6] += x06; + B[ 7] += x07; + B[ 8] += x08; + B[ 9] += x09; + B[10] += x10; + B[11] += x11; + B[12] += x12; + B[13] += x13; + B[14] += x14; + B[15] += x15; +} + +static inline void scrypt_core(uint32_t *X, uint32_t *V) +{ + uint32_t i, j, k; + + for (i = 0; i < 1024; i++) { + memcpy(&V[i * 32], X, 128); + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } + for (i = 0; i < 1024; i++) { + j = 32 * (X[16] & 1023); + for (k = 0; k < 32; k++) + X[k] ^= V[j + k]; + xor_salsa8(&X[0], &X[16]); + xor_salsa8(&X[16], &X[0]); + } +} + +#endif + +#ifndef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 1 +#define scrypt_best_throughput() 1 +#endif + +#define SCRYPT_BUFFER_SIZE (SCRYPT_MAX_WAYS * 131072 + 63) + +unsigned char *scrypt_buffer_alloc() +{ + return malloc(SCRYPT_BUFFER_SIZE); +} + +static void scrypt_1024_1_1_256(const uint32_t *input, uint32_t *output, + uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[8], ostate[8]; + uint32_t X[32]; + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate, midstate, 32); + HMAC_SHA256_80_init(input, tstate, ostate); + PBKDF2_SHA256_80_128(tstate, ostate, input, X); + + scrypt_core(X, V); + + PBKDF2_SHA256_128_32(tstate, ostate, X, output); +} + +#ifdef HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_4way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[4 * 8] __attribute__((aligned(128))); + uint32_t ostate[4 * 8] __attribute__((aligned(128))); + uint32_t W[4 * 32] __attribute__((aligned(128))); + uint32_t X[4 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = input[k * 20 + i]; + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W, tstate, ostate); + PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[k * 32 + i] = W[4 * i + k]; + scrypt_core(X + 0 * 32, V); + scrypt_core(X + 1 * 32, V); + scrypt_core(X + 2 * 32, V); + scrypt_core(X + 3 * 32, V); + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = X[k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[k * 8 + i] = W[4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#ifdef HAVE_SCRYPT_3WAY + +static void scrypt_1024_1_1_256_3way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[3 * 
8], ostate[3 * 8]; + uint32_t X[3 * 32] __attribute__((aligned(64))); + uint32_t *V; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + memcpy(tstate + 0, midstate, 32); + memcpy(tstate + 8, midstate, 32); + memcpy(tstate + 16, midstate, 32); + HMAC_SHA256_80_init(input + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init(input + 20, tstate + 8, ostate + 8); + HMAC_SHA256_80_init(input + 40, tstate + 16, ostate + 16); + PBKDF2_SHA256_80_128(tstate + 0, ostate + 0, input + 0, X + 0); + PBKDF2_SHA256_80_128(tstate + 8, ostate + 8, input + 20, X + 32); + PBKDF2_SHA256_80_128(tstate + 16, ostate + 16, input + 40, X + 64); + + scrypt_core_3way(X, V); + + PBKDF2_SHA256_128_32(tstate + 0, ostate + 0, X + 0, output + 0); + PBKDF2_SHA256_128_32(tstate + 8, ostate + 8, X + 32, output + 8); + PBKDF2_SHA256_128_32(tstate + 16, ostate + 16, X + 64, output + 16); +} + +#ifdef HAVE_SHA256_4WAY +static void scrypt_1024_1_1_256_12way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[12 * 8] __attribute__((aligned(128))); + uint32_t ostate[12 * 8] __attribute__((aligned(128))); + uint32_t W[12 * 32] __attribute__((aligned(128))); + uint32_t X[12 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[32 * j + 4 * i + k] = midstate[i]; + HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); + HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); + PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; + scrypt_core_3way(X + 0 * 96, V); + scrypt_core_3way(X + 1 * 96, V); + scrypt_core_3way(X + 2 * 96, V); + scrypt_core_3way(X + 3 * 96, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); + PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; +} +#endif /* HAVE_SHA256_4WAY */ + +#endif /* HAVE_SCRYPT_3WAY */ + +#ifdef HAVE_SCRYPT_6WAY +static void scrypt_1024_1_1_256_24way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[24 * 8] __attribute__((aligned(128))); + uint32_t ostate[24 * 8] __attribute__((aligned(128))); + uint32_t W[24 * 32] __attribute__((aligned(128))); + uint32_t X[24 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + 
tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); + HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); + PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; + scrypt_core_6way(X + 0 * 32, V); + scrypt_core_6way(X + 6 * 32, V); + scrypt_core_6way(X + 12 * 32, V); + scrypt_core_6way(X + 18 * 32, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; +} +#endif /* HAVE_SCRYPT_6WAY */ + +int scanhash_scrypt(int thr_id, uint32_t *pdata, + unsigned char *scratchbuf, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[SCRYPT_MAX_WAYS * 20], hash[SCRYPT_MAX_WAYS * 8]; + uint32_t midstate[8]; + uint32_t n = pdata[19] - 1; + const uint32_t Htarg = ptarget[7]; + int throughput = scrypt_best_throughput(); + int i; + +#ifdef HAVE_SHA256_4WAY + if (sha256_use_4way()) + throughput *= 4; +#endif + + for (i = 0; i < throughput; i++) + memcpy(data + i * 20, pdata, 80); + + sha256_init(midstate); + sha256_transform(midstate, data, 0); + + do { + for (i = 0; i < throughput; i++) + data[i * 20 + 19] = ++n; + +#if defined(HAVE_SHA256_4WAY) + if (throughput == 4) + scrypt_1024_1_1_256_4way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) && defined(HAVE_SHA256_4WAY) + if (throughput == 12) + scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_6WAY) + if (throughput == 24) + scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); + else +#endif +#if defined(HAVE_SCRYPT_3WAY) + if (throughput == 3) + scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); + else +#endif + scrypt_1024_1_1_256(data, hash, midstate, scratchbuf); + + for (i = 0; i < throughput; i++) { + if (hash[i * 8 + 7] <= Htarg && fulltest(hash + i * 8, ptarget)) { + *hashes_done = n - pdata[19] + 1; + pdata[19] = data[i * 20 + 19]; + return 1; + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - pdata[19] + 1; + pdata[19] = n; + return 0; +} diff --git a/scryptjane/scrypt-jane-chacha.h b/scryptjane/scrypt-jane-chacha.h new file mode 100644 index 00000000..41d96e5e --- /dev/null +++ b/scryptjane/scrypt-jane-chacha.h @@ -0,0 +1,132 @@ +#define SCRYPT_MIX_BASE "ChaCha20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_chacha-avx.h" +#include "scrypt-jane-mix_chacha-ssse3.h" +#include "scrypt-jane-mix_chacha-sse2.h" 
+#include "scrypt-jane-mix_chacha.h" + +#if defined(SCRYPT_CHACHA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_MIX_FN chacha_core_avx + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_ssse3 + #define SCRYPT_ROMIX_FN scrypt_ROMix_ssse3 + #define SCRYPT_MIX_FN chacha_core_ssse3 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN chacha_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_nop + #define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_nop + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN chacha_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + return scrypt_ROMix_ssse3; + else +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_CHACHA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + flags |= cpu_ssse3; +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x48,0x2b,0x2d,0xb8,0xa1,0x33,0x22,0x73,0xcd,0x16,0xc4,0xb4,0xb0,0x7f,0xb1,0x8a, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_CHACHA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + if (cpuflags & cpu_ssse3) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_ssse3, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, scrypt_romix_nop, scrypt_romix_nop, expected); +#endif + +#if defined(SCRYPT_CHACHA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} + diff --git a/scryptjane/scrypt-jane-hash.h b/scryptjane/scrypt-jane-hash.h new file mode 100644 index 00000000..db5c1db3 --- /dev/null +++ b/scryptjane/scrypt-jane-hash.h @@ -0,0 +1,48 @@ +#if defined(SCRYPT_BLAKE512) +#include "scrypt-jane-hash_blake512.h" +#elif defined(SCRYPT_BLAKE256) +#include "scrypt-jane-hash_blake256.h" +#elif defined(SCRYPT_SHA512) +#include "scrypt-jane-hash_sha512.h" +#elif defined(SCRYPT_SHA256) +#include "scrypt-jane-hash_sha256.h" +#elif defined(SCRYPT_SKEIN512) +#include "scrypt-jane-hash_skein512.h" +#elif 
defined(SCRYPT_KECCAK512) || defined(SCRYPT_KECCAK256) +#include "scrypt-jane-hash_keccak.h" +#else + #define SCRYPT_HASH "ERROR" + #define SCRYPT_HASH_BLOCK_SIZE 64 + #define SCRYPT_HASH_DIGEST_SIZE 64 + typedef struct scrypt_hash_state_t { size_t dummy; } scrypt_hash_state; + typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + static void scrypt_hash_init(scrypt_hash_state *S) {} + static void scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) {} + static void scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) {} + static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = {0}; + #error must define a hash function! +#endif + +#include "scrypt-jane-pbkdf2.h" + +#define SCRYPT_TEST_HASH_LEN 257 /* (2 * largest block size) + 1 */ + +static int +scrypt_test_hash() { + scrypt_hash_state st; + scrypt_hash_digest hash, final; + uint8_t msg[SCRYPT_TEST_HASH_LEN]; + size_t i; + + for (i = 0; i < SCRYPT_TEST_HASH_LEN; i++) + msg[i] = (uint8_t)i; + + scrypt_hash_init(&st); + for (i = 0; i < SCRYPT_TEST_HASH_LEN + 1; i++) { + scrypt_hash(hash, msg, i); + scrypt_hash_update(&st, hash, sizeof(hash)); + } + scrypt_hash_finish(&st, final); + return scrypt_verify(final, scrypt_test_hash_expected, SCRYPT_HASH_DIGEST_SIZE); +} + diff --git a/scryptjane/scrypt-jane-hash_keccak.h b/scryptjane/scrypt-jane-hash_keccak.h new file mode 100644 index 00000000..7ed55747 --- /dev/null +++ b/scryptjane/scrypt-jane-hash_keccak.h @@ -0,0 +1,168 @@ +#if defined(SCRYPT_KECCAK256) + #define SCRYPT_HASH "Keccak-256" + #define SCRYPT_HASH_DIGEST_SIZE 32 +#else + #define SCRYPT_HASH "Keccak-512" + #define SCRYPT_HASH_DIGEST_SIZE 64 +#endif +#define SCRYPT_KECCAK_F 1600 +#define SCRYPT_KECCAK_C (SCRYPT_HASH_DIGEST_SIZE * 8 * 2) /* 256=512, 512=1024 */ +#define SCRYPT_KECCAK_R (SCRYPT_KECCAK_F - SCRYPT_KECCAK_C) /* 256=1088, 512=576 */ +#define SCRYPT_HASH_BLOCK_SIZE (SCRYPT_KECCAK_R / 8) + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint64_t state[SCRYPT_KECCAK_F / 64]; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static void +keccak_block(scrypt_hash_state *S, const uint8_t *in) { + size_t i; + uint64_t *s = S->state, t[5], u[5], v, w; + + /* absorb input */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8) + s[i] ^= U8TO64_LE(in); + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + memset(S, 0, sizeof(*S)); +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? 
want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + keccak_block(S, S->buffer); + } + + /* handle the current data */ + while (inlen >= SCRYPT_HASH_BLOCK_SIZE) { + keccak_block(S, in); + in += SCRYPT_HASH_BLOCK_SIZE; + inlen -= SCRYPT_HASH_BLOCK_SIZE; + } + + /* handle leftover data */ + S->leftover = (uint32_t)inlen; + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + size_t i; + + S->buffer[S->leftover] = 0x01; + memset(S->buffer + (S->leftover + 1), 0, SCRYPT_HASH_BLOCK_SIZE - (S->leftover + 1)); + S->buffer[SCRYPT_HASH_BLOCK_SIZE - 1] |= 0x80; + keccak_block(S, S->buffer); + + for (i = 0; i < SCRYPT_HASH_DIGEST_SIZE; i += 8) { + U64TO8_LE(&hash[i], S->state[i / 8]); + } +} + +#if defined(SCRYPT_KECCAK256) +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x26,0xb7,0x10,0xb3,0x66,0xb1,0xd1,0xb1,0x25,0xfc,0x3e,0xe3,0x1e,0x33,0x1d,0x19, + 0x94,0xaa,0x63,0x7a,0xd5,0x77,0x29,0xb4,0x27,0xe9,0xe0,0xf4,0x19,0xba,0x68,0xea, +}; +#else +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0x17,0xc7,0x8c,0xa0,0xd9,0x08,0x1d,0xba,0x8a,0xc8,0x3e,0x07,0x90,0xda,0x91,0x88, + 0x25,0xbd,0xd3,0xf8,0x78,0x4a,0x8d,0x5e,0xe4,0x96,0x9c,0x01,0xf3,0xeb,0xdc,0x12, + 0xea,0x35,0x57,0xba,0x94,0xb8,0xe9,0xb9,0x27,0x45,0x0a,0x48,0x5c,0x3d,0x69,0xf0, + 0xdb,0x22,0x38,0xb5,0x52,0x22,0x29,0xea,0x7a,0xb2,0xe6,0x07,0xaa,0x37,0x4d,0xe6, +}; +#endif + diff --git a/scryptjane/scrypt-jane-hash_sha256.h b/scryptjane/scrypt-jane-hash_sha256.h new file mode 100644 index 00000000..d06d3e1b --- /dev/null +++ b/scryptjane/scrypt-jane-hash_sha256.h @@ -0,0 +1,135 @@ +#define SCRYPT_HASH "SHA-2-256" +#define SCRYPT_HASH_BLOCK_SIZE 64 +#define SCRYPT_HASH_DIGEST_SIZE 32 + +typedef uint8_t scrypt_hash_digest[SCRYPT_HASH_DIGEST_SIZE]; + +typedef struct scrypt_hash_state_t { + uint32_t H[8]; + uint64_t T; + uint32_t leftover; + uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE]; +} scrypt_hash_state; + +static const uint32_t sha256_constants[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define Ch(x,y,z) (z ^ (x & (y ^ z))) +#define Maj(x,y,z) (((x | y) & z) | (x & y)) +#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) +#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) +#define G0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ (x >> 3)) +#define G1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10)) +#define W0(in,i) (U8TO32_BE(&in[i * 4])) +#define W1(i) (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16]) +#define STEP(i) \ + t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \ + t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; 
\ + r[7] = r[6]; \ + r[6] = r[5]; \ + r[5] = r[4]; \ + r[4] = r[3] + t0; \ + r[3] = r[2]; \ + r[2] = r[1]; \ + r[1] = r[0]; \ + r[0] = t0 + t1; + +static void +sha256_blocks(scrypt_hash_state *S, const uint8_t *in, size_t blocks) { + uint32_t r[8], w[64], t0, t1; + size_t i; + + for (i = 0; i < 8; i++) r[i] = S->H[i]; + + while (blocks--) { + for (i = 0; i < 16; i++) { w[i] = W0(in, i); } + for (i = 16; i < 64; i++) { w[i] = W1(i); } + for (i = 0; i < 64; i++) { STEP(i); } + for (i = 0; i < 8; i++) { r[i] += S->H[i]; S->H[i] = r[i]; } + S->T += SCRYPT_HASH_BLOCK_SIZE * 8; + in += SCRYPT_HASH_BLOCK_SIZE; + } +} + +static void +scrypt_hash_init(scrypt_hash_state *S) { + S->H[0] = 0x6a09e667; + S->H[1] = 0xbb67ae85; + S->H[2] = 0x3c6ef372; + S->H[3] = 0xa54ff53a; + S->H[4] = 0x510e527f; + S->H[5] = 0x9b05688c; + S->H[6] = 0x1f83d9ab; + S->H[7] = 0x5be0cd19; + S->T = 0; + S->leftover = 0; +} + +static void +scrypt_hash_update(scrypt_hash_state *S, const uint8_t *in, size_t inlen) { + size_t blocks, want; + + /* handle the previous data */ + if (S->leftover) { + want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover); + want = (want < inlen) ? want : inlen; + memcpy(S->buffer + S->leftover, in, want); + S->leftover += (uint32_t)want; + if (S->leftover < SCRYPT_HASH_BLOCK_SIZE) + return; + in += want; + inlen -= want; + sha256_blocks(S, S->buffer, 1); + } + + /* handle the current data */ + blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1)); + S->leftover = (uint32_t)(inlen - blocks); + if (blocks) { + sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE); + in += blocks; + } + + /* handle leftover data */ + if (S->leftover) + memcpy(S->buffer, in, S->leftover); +} + +static void +scrypt_hash_finish(scrypt_hash_state *S, uint8_t *hash) { + uint64_t t = S->T + (S->leftover * 8); + + S->buffer[S->leftover] = 0x80; + if (S->leftover <= 55) { + memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover); + } else { + memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover); + sha256_blocks(S, S->buffer, 1); + memset(S->buffer, 0, 56); + } + + U64TO8_BE(S->buffer + 56, t); + sha256_blocks(S, S->buffer, 1); + + U32TO8_BE(&hash[ 0], S->H[0]); + U32TO8_BE(&hash[ 4], S->H[1]); + U32TO8_BE(&hash[ 8], S->H[2]); + U32TO8_BE(&hash[12], S->H[3]); + U32TO8_BE(&hash[16], S->H[4]); + U32TO8_BE(&hash[20], S->H[5]); + U32TO8_BE(&hash[24], S->H[6]); + U32TO8_BE(&hash[28], S->H[7]); +} + +static const uint8_t scrypt_test_hash_expected[SCRYPT_HASH_DIGEST_SIZE] = { + 0xee,0x36,0xae,0xa6,0x65,0xf0,0x28,0x7d,0xc9,0xde,0xd8,0xad,0x48,0x33,0x7d,0xbf, + 0xcb,0xc0,0x48,0xfa,0x5f,0x92,0xfd,0x0a,0x95,0x6f,0x34,0x8e,0x8c,0x1e,0x73,0xad, +}; diff --git a/scryptjane/scrypt-jane-mix_chacha-avx.h b/scryptjane/scrypt-jane-mix_chacha-avx.h new file mode 100644 index 00000000..50d6e2d2 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-avx.h @@ -0,0 +1,340 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + 
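+ /* 1: X = B_{2r-1} -- start from the last 64-byte block of the input
+ * chunk; xmm4/xmm5 hold byte-shuffle masks so the ChaCha rotations by
+ * 16 and 8 bits can each be done with a single vpshufb. */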
a2(vmovdqa xmm0,[ecx+esi+0]) + a2(vmovdqa xmm1,[ecx+esi+16]) + a2(vmovdqa xmm2,[ecx+esi+32]) + a2(vmovdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa [esp+32],xmm2) + a2(vmovdqa [esp+48],xmm3) + a2(mov eax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a2(sub eax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm6,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm6) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm6,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm6) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,[esp+32]) + a3(vpaddd xmm3,xmm3,[esp+48]) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm4,[ssse3_rotl16_32bit]) + a2(vmovdqa xmm5,[ssse3_rotl8_32bit]) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + 
a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_avx_loop: ) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x93) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpshufd xmm2,xmm2,0x39) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a2(sub rax,2) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm4) + a3(vpaddd xmm2,xmm2,xmm3) + a3(vpxor xmm1,xmm1,xmm2) + a3(vpsrld xmm12,xmm1,20) + a3(vpslld xmm1,xmm1,12) + a3(vpxor xmm1,xmm1,xmm12) + a3(vpaddd xmm0,xmm0,xmm1) + a3(vpxor xmm3,xmm3,xmm0) + a3(vpshufb xmm3,xmm3,xmm5) + a3(vpshufd xmm0,xmm0,0x39) + a3(vpaddd xmm2,xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a3(vpxor xmm1,xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a3(vpsrld xmm12,xmm1,25) + a3(vpslld xmm1,xmm1,7) + a3(vpxor xmm1,xmm1,xmm12) + a1(ja scrypt_chacha_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = 
_mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_AVX) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-AVX" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha-sse2.h b/scryptjane/scrypt-jane-mix_chacha-sse2.h new file mode 100644 index 00000000..d2192c8f --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-sse2.h @@ -0,0 +1,371 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,16) + a2(and esp,~15) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa xmm4,xmm1) + a2(movdqa xmm5,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa 
xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,xmm4) + a2(paddd xmm2,xmm5) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_sse2_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,16) + a2(psrld xmm6,16) + a2(pxor xmm3,xmm6) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + 
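The pslld/psrld/pxor triples in this loop exist because SSE2 has no packed-rotate instruction: each 32-bit lane rotation has to be assembled from a left shift, a right shift, and a combine. A minimal standalone intrinsics sketch of the same idiom (illustrative only; the helper name is not part of the patch):

#include <emmintrin.h>

/* Rotate each 32-bit lane of x left by n bits (0 < n < 32). This is the
   intrinsics form of the pslld/psrld/pxor sequences in the asm above;
   XOR and OR are interchangeable here because the two shifted halves
   have no overlapping bits. */
static __m128i
rotl32_sse2(__m128i x, int n) {
	return _mm_or_si128(_mm_slli_epi32(x, n), _mm_srli_epi32(x, 32 - n));
}
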
a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(movdqa xmm6,xmm3) + a2(pslld xmm3,8) + a2(psrld xmm6,24) + a2(pxor xmm3,xmm6) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 16), _mm_srli_epi32(x4, 16)); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x4, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x4 = x3; + x3 = _mm_or_si128(_mm_slli_epi32(x3, 8), _mm_srli_epi32(x4, 24)); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x4 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x4, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = 
_mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSE2" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha-ssse3.h b/scryptjane/scrypt-jane-mix_chacha-ssse3.h new file mode 100644 index 00000000..b25e3567 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha-ssse3.h @@ -0,0 +1,348 @@ +/* x86 */ +#if defined(X86ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,64) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa [esp+32],xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a2(sub eax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm6,20) + a2(pxor xmm1,xmm6) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm6,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm6,25) + a2(pxor xmm1,xmm6) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,[esp+32]) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + 
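Relative to the SSE2 version, this SSSE3 code performs ChaCha's 16- and 8-bit rotations with a single pshufb each: rotating a 32-bit word by a multiple of 8 is just a byte permutation, and the ssse3_rotl16_32bit / ssse3_rotl8_32bit constants are the permutation masks. A standalone sketch of the 16-bit case (illustrative; the mask data is copied from the table, the names are not from the patch):

#include <tmmintrin.h>

/* Rotate each 32-bit lane left by 16 using one byte shuffle. Within every
   4-byte lane, bytes {0,1,2,3} become {2,3,0,1}; the mask matches
   ssse3_rotl16_32bit above. */
static const unsigned char rotl16_mask[16] = {
	2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
};

static __m128i
rotl16_ssse3(__m128i x) {
	return _mm_shuffle_epi8(x, _mm_loadu_si128((const __m128i *)rotl16_mask));
}
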
a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_ssse3_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +asm_naked_fn_proto(void, scrypt_ChunkMix_ssse3)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_ssse3) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm4,[ssse3_rotl16_32bit]) + a2(movdqa xmm5,[ssse3_rotl8_32bit]) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor1:) + a2(xor r8,r8) + a2(xor r9,r9) + a1(scrypt_ChunkMix_ssse3_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_ssse3_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_ssse3_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_chacha_ssse3_loop: ) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x93) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x39) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a2(sub rax,2) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm4) + a2(paddd xmm2,xmm3) + a2(pxor xmm1,xmm2) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,12) + a2(psrld xmm12,20) + a2(pxor xmm1,xmm12) + a2(paddd xmm0,xmm1) + a2(pxor xmm3,xmm0) + a2(pshufb xmm3,xmm5) + a3(pshufd xmm0,xmm0,0x39) + a2(paddd xmm2,xmm3) + a3(pshufd xmm3,xmm3,0x4e) + a2(pxor xmm1,xmm2) + a3(pshufd xmm2,xmm2,0x93) + a2(movdqa xmm12,xmm1) + a2(pslld xmm1,7) + a2(psrld xmm12,25) + a2(pxor xmm1,xmm12) + a1(ja scrypt_chacha_ssse3_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa [rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_ssse3_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_ssse3) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSSE3) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED)) + +#define SCRYPT_CHACHA_SSSE3 + +static void NOINLINE +scrypt_ChunkMix_ssse3(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x6,t0,t1,t2,t3; + const xmmi x4 = *(xmmi *)&ssse3_rotl16_32bit, x5 = *(xmmi *)&ssse3_rotl8_32bit; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi 
*)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x93); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x39); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x4); + x2 = _mm_add_epi32(x2, x3); + x1 = _mm_xor_si128(x1, x2); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 12), _mm_srli_epi32(x6, 20)); + x0 = _mm_add_epi32(x0, x1); + x3 = _mm_xor_si128(x3, x0); + x3 = _mm_shuffle_epi8(x3, x5); + x0 = _mm_shuffle_epi32(x0, 0x39); + x2 = _mm_add_epi32(x2, x3); + x3 = _mm_shuffle_epi32(x3, 0x4e); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_shuffle_epi32(x2, 0x93); + x6 = x1; + x1 = _mm_or_si128(_mm_slli_epi32(x1, 7), _mm_srli_epi32(x6, 25)); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_CHACHA_SSSE3) + #undef SCRYPT_MIX + #define SCRYPT_MIX "ChaCha/8-SSSE3" + #undef SCRYPT_CHACHA_INCLUDED + #define SCRYPT_CHACHA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_chacha.h b/scryptjane/scrypt-jane-mix_chacha.h new file mode 100644 index 00000000..85ee9c1c --- /dev/null +++ b/scryptjane/scrypt-jane-mix_chacha.h @@ -0,0 +1,69 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_CHACHA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "ChaCha20/8 Ref" + +#undef SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_INCLUDED +#define SCRYPT_CHACHA_BASIC + +static void +chacha_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + a += b; t = d^a; d = ROTL32(t,16); \ + c += d; t = b^c; b = ROTL32(t,12); \ + a += b; t = d^a; 
d = ROTL32(t, 8); \ + c += d; t = b^c; b = ROTL32(t, 7); + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x1, x5, x9,x13) + quarter( x2, x6,x10,x14) + quarter( x3, x7,x11,x15) + quarter( x0, x5,x10,x15) + quarter( x1, x6,x11,x12) + quarter( x2, x7, x8,x13) + quarter( x3, x4, x9,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif \ No newline at end of file diff --git a/scryptjane/scrypt-jane-mix_salsa-avx.h b/scryptjane/scrypt-jane-mix_salsa-avx.h new file mode 100644 index 00000000..15fb48e3 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-avx.h @@ -0,0 +1,381 @@ +/* x86 */ +#if defined(X86ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[ecx+eax+0]) + a3(vpxor xmm1,xmm1,[ecx+eax+16]) + a3(vpxor xmm2,xmm2,[ecx+eax+32]) + a3(vpxor xmm3,xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_avx_loop:) + a2(and eax, eax) + a3(vpxor xmm0,xmm0,[esi+ecx+0]) + a3(vpxor xmm1,xmm1,[esi+ecx+16]) + a3(vpxor xmm2,xmm2,[esi+ecx+32]) + a3(vpxor xmm3,xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[eax+ecx+0]) + a3(vpxor xmm1,xmm1,[eax+ecx+16]) + a3(vpxor xmm2,xmm2,[eax+ecx+32]) + a3(vpxor xmm3,xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa [esp+0],xmm0) + a2(vmovdqa [esp+16],xmm1) + a2(vmovdqa xmm6,xmm2) + a2(vmovdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub eax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, 
xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,[esp+0]) + a3(vpaddd xmm1,xmm1,[esp+16]) + a3(vpaddd xmm2,xmm2,xmm6) + a3(vpaddd xmm3,xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(vmovdqa [eax+0],xmm0) + a2(vmovdqa [eax+16],xmm1) + a2(vmovdqa [eax+32],xmm2) + a2(vmovdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_avx_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +asm_naked_fn_proto(void, scrypt_ChunkMix_avx)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_avx) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(vmovdqa xmm0,[rax+0]) + a2(vmovdqa xmm1,[rax+16]) + a2(vmovdqa xmm2,[rax+32]) + a2(vmovdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_avx_no_xor1) + a3(vpxor xmm0,xmm0,[r9+0]) + a3(vpxor xmm1,xmm1,[r9+16]) + a3(vpxor xmm2,xmm2,[r9+32]) + a3(vpxor xmm3,xmm3,[r9+48]) + a1(scrypt_ChunkMix_avx_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_avx_loop:) + a2(and rdx, rdx) + a3(vpxor xmm0,xmm0,[rsi+r9+0]) + a3(vpxor xmm1,xmm1,[rsi+r9+16]) + a3(vpxor xmm2,xmm2,[rsi+r9+32]) + a3(vpxor xmm3,xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_avx_no_xor2) + a3(vpxor xmm0,xmm0,[rdx+r9+0]) + a3(vpxor xmm1,xmm1,[rdx+r9+16]) + a3(vpxor xmm2,xmm2,[rdx+r9+32]) + a3(vpxor xmm3,xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_avx_no_xor2:) + a2(vmovdqa xmm8,xmm0) + a2(vmovdqa xmm9,xmm1) + a2(vmovdqa xmm10,xmm2) + a2(vmovdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_avx_loop: ) + a3(vpaddd xmm4, xmm1, xmm0) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm3, xmm3, xmm5) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm0, xmm3) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm3, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm1, xmm1, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm2, xmm1) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a2(sub rax, 2) + a3(vpaddd xmm4, xmm3, xmm0) + a3(pshufd xmm1, xmm1, 0x39) + a3(vpsrld xmm5, xmm4, 25) + a3(vpslld xmm4, xmm4, 7) + a3(vpxor xmm1, xmm1, xmm5) + a3(vpxor xmm1, xmm1, xmm4) + a3(vpaddd xmm4, xmm0, xmm1) + a3(vpsrld xmm5, xmm4, 23) + a3(vpslld xmm4, xmm4, 9) + a3(vpxor xmm2, xmm2, xmm5) + a3(vpxor xmm2, xmm2, xmm4) + a3(vpaddd xmm4, xmm1, xmm2) + a3(vpsrld xmm5, xmm4, 19) + a3(vpslld xmm4, xmm4, 13) + a3(vpxor xmm3, xmm3, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a3(vpxor xmm3, xmm3, xmm4) + a3(vpaddd xmm4, xmm2, xmm3) + a3(vpsrld xmm5, xmm4, 14) + a3(vpslld xmm4, xmm4, 18) + a3(vpxor xmm0, xmm0, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a3(vpxor xmm0, xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a1(ja scrypt_salsa_avx_loop) + a3(vpaddd xmm0,xmm0,xmm8) + a3(vpaddd 
xmm1,xmm1,xmm9) + a3(vpaddd xmm2,xmm2,xmm10) + a3(vpaddd xmm3,xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(vmovdqa [rax+0],xmm0) + a2(vmovdqa [rax+16],xmm1) + a2(vmovdqa [rax+32],xmm2) + a2(vmovdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_avx_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_avx) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_AVX) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_AVX + +static void NOINLINE +scrypt_ChunkMix_avx(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even 
*/ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_AVX) + /* uses salsa_core_tangle_sse2 */ + + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-AVX" + #undef SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif diff --git a/scryptjane/scrypt-jane-mix_salsa-sse2.h b/scryptjane/scrypt-jane-mix_salsa-sse2.h new file mode 100644 index 00000000..4898659e --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa-sse2.h @@ -0,0 +1,443 @@ +/* x86 */ +#if defined(X86ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a1(push ebx) + a1(push edi) + a1(push esi) + a1(push ebp) + a2(mov ebp,esp) + a2(mov edi,[ebp+20]) + a2(mov esi,[ebp+24]) + a2(mov eax,[ebp+28]) + a2(mov ebx,[ebp+32]) + a2(sub esp,32) + a2(and esp,~63) + a2(lea edx,[ebx*2]) + a2(shl edx,6) + a2(lea ecx,[edx-64]) + a2(and eax, eax) + a2(movdqa xmm0,[ecx+esi+0]) + a2(movdqa xmm1,[ecx+esi+16]) + a2(movdqa xmm2,[ecx+esi+32]) + a2(movdqa xmm3,[ecx+esi+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[ecx+eax+0]) + a2(pxor xmm1,[ecx+eax+16]) + a2(pxor xmm2,[ecx+eax+32]) + a2(pxor xmm3,[ecx+eax+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor ecx,ecx) + a2(xor ebx,ebx) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and eax, eax) + a2(pxor xmm0,[esi+ecx+0]) + a2(pxor xmm1,[esi+ecx+16]) + a2(pxor xmm2,[esi+ecx+32]) + a2(pxor xmm3,[esi+ecx+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[eax+ecx+0]) + a2(pxor xmm1,[eax+ecx+16]) + a2(pxor xmm2,[eax+ecx+32]) + a2(pxor xmm3,[eax+ecx+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa [esp+0],xmm0) + a2(movdqa [esp+16],xmm1) + a2(movdqa xmm6,xmm2) + a2(movdqa xmm7,xmm3) + a2(mov eax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub eax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + a2(paddd 
xmm0,[esp+0]) + a2(paddd xmm1,[esp+16]) + a2(paddd xmm2,xmm6) + a2(paddd xmm3,xmm7) + a2(lea eax,[ebx+ecx]) + a2(xor ebx,edx) + a2(and eax,~0x7f) + a2(add ecx,64) + a2(shr eax,1) + a2(add eax, edi) + a2(cmp ecx,edx) + a2(movdqa [eax+0],xmm0) + a2(movdqa [eax+16],xmm1) + a2(movdqa [eax+32],xmm2) + a2(movdqa [eax+48],xmm3) + a2(mov eax,[ebp+28]) + a1(jne scrypt_ChunkMix_sse2_loop) + a2(mov esp,ebp) + a1(pop ebp) + a1(pop esi) + a1(pop edi) + a1(pop ebx) + a1(ret 16) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + + +/* x64 */ +#if defined(X86_64ASM_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +asm_naked_fn_proto(void, scrypt_ChunkMix_sse2)(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) +asm_naked_fn(scrypt_ChunkMix_sse2) + a2(lea rcx,[rcx*2]) + a2(shl rcx,6) + a2(lea r9,[rcx-64]) + a2(lea rax,[rsi+r9]) + a2(lea r9,[rdx+r9]) + a2(and rdx, rdx) + a2(movdqa xmm0,[rax+0]) + a2(movdqa xmm1,[rax+16]) + a2(movdqa xmm2,[rax+32]) + a2(movdqa xmm3,[rax+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor1) + a2(pxor xmm0,[r9+0]) + a2(pxor xmm1,[r9+16]) + a2(pxor xmm2,[r9+32]) + a2(pxor xmm3,[r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor1:) + a2(xor r9,r9) + a2(xor r8,r8) + a1(scrypt_ChunkMix_sse2_loop:) + a2(and rdx, rdx) + a2(pxor xmm0,[rsi+r9+0]) + a2(pxor xmm1,[rsi+r9+16]) + a2(pxor xmm2,[rsi+r9+32]) + a2(pxor xmm3,[rsi+r9+48]) + a1(jz scrypt_ChunkMix_sse2_no_xor2) + a2(pxor xmm0,[rdx+r9+0]) + a2(pxor xmm1,[rdx+r9+16]) + a2(pxor xmm2,[rdx+r9+32]) + a2(pxor xmm3,[rdx+r9+48]) + a1(scrypt_ChunkMix_sse2_no_xor2:) + a2(movdqa xmm8,xmm0) + a2(movdqa xmm9,xmm1) + a2(movdqa xmm10,xmm2) + a2(movdqa xmm11,xmm3) + a2(mov rax,8) + a1(scrypt_salsa_sse2_loop: ) + a2(movdqa xmm4, xmm1) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm3, xmm5) + a2(paddd xmm4, xmm3) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm2, xmm5) + a3(pshufd xmm3, xmm3, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm1, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a2(movdqa xmm4, xmm3) + a2(pxor xmm0, xmm5) + a3(pshufd xmm1, xmm1, 0x39) + a2(paddd xmm4, xmm0) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 7) + a2(psrld xmm5, 25) + a2(pxor xmm1, xmm4) + a2(movdqa xmm4, xmm0) + a2(pxor xmm1, xmm5) + a2(paddd xmm4, xmm1) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 9) + a2(psrld xmm5, 23) + a2(pxor xmm2, xmm4) + a2(movdqa xmm4, xmm1) + a2(pxor xmm2, xmm5) + a3(pshufd xmm1, xmm1, 0x93) + a2(paddd xmm4, xmm2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 13) + a2(psrld xmm5, 19) + a2(pxor xmm3, xmm4) + a2(movdqa xmm4, xmm2) + a2(pxor xmm3, xmm5) + a3(pshufd xmm2, xmm2, 0x4e) + a2(paddd xmm4, xmm3) + a2(sub rax, 2) + a2(movdqa xmm5, xmm4) + a2(pslld xmm4, 18) + a2(psrld xmm5, 14) + a2(pxor xmm0, xmm4) + a3(pshufd xmm3, xmm3, 0x39) + a2(pxor xmm0, xmm5) + a1(ja scrypt_salsa_sse2_loop) + a2(paddd xmm0,xmm8) + a2(paddd xmm1,xmm9) + a2(paddd xmm2,xmm10) + a2(paddd xmm3,xmm11) + a2(lea rax,[r8+r9]) + a2(xor r8,rcx) + a2(and rax,~0x7f) + a2(add r9,64) + a2(shr rax,1) + a2(add rax, rdi) + a2(cmp r9,rcx) + a2(movdqa [rax+0],xmm0) + a2(movdqa [rax+16],xmm1) + a2(movdqa 
[rax+32],xmm2) + a2(movdqa [rax+48],xmm3) + a1(jne scrypt_ChunkMix_sse2_loop) + a1(ret) +asm_naked_fn_end(scrypt_ChunkMix_sse2) + +#endif + + +/* intrinsic */ +#if defined(X86_INTRINSIC_SSE2) && (!defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED)) + +#define SCRYPT_SALSA_SSE2 + +static void NOINLINE +scrypt_ChunkMix_sse2(uint32_t *Bout/*[chunkBytes]*/, uint32_t *Bin/*[chunkBytes]*/, uint32_t *Bxor/*[chunkBytes]*/, uint32_t r) { + uint32_t i, blocksPerChunk = r * 2, half = 0; + xmmi *xmmp,x0,x1,x2,x3,x4,x5,t0,t1,t2,t3; + size_t rounds; + + /* 1: X = B_{2r - 1} */ + xmmp = (xmmi *)scrypt_block(Bin, blocksPerChunk - 1); + x0 = xmmp[0]; + x1 = xmmp[1]; + x2 = xmmp[2]; + x3 = xmmp[3]; + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, blocksPerChunk - 1); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + xmmp = (xmmi *)scrypt_block(Bin, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + + if (Bxor) { + xmmp = (xmmi *)scrypt_block(Bxor, i); + x0 = _mm_xor_si128(x0, xmmp[0]); + x1 = _mm_xor_si128(x1, xmmp[1]); + x2 = _mm_xor_si128(x2, xmmp[2]); + x3 = _mm_xor_si128(x3, xmmp[3]); + } + + t0 = x0; + t1 = x1; + t2 = x2; + t3 = x3; + + for (rounds = 8; rounds; rounds -= 2) { + x4 = x1; + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x3 = _mm_xor_si128(x3, x4); + x4 = x0; + x3 = _mm_xor_si128(x3, x5); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x3; + x2 = _mm_xor_si128(x2, x5); + x3 = _mm_shuffle_epi32(x3, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x1 = _mm_xor_si128(x1, x4); + x4 = x2; + x1 = _mm_xor_si128(x1, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x4 = x3; + x0 = _mm_xor_si128(x0, x5); + x1 = _mm_shuffle_epi32(x1, 0x39); + x4 = _mm_add_epi32(x4, x0); + x5 = x4; + x4 = _mm_slli_epi32(x4, 7); + x5 = _mm_srli_epi32(x5, 25); + x1 = _mm_xor_si128(x1, x4); + x4 = x0; + x1 = _mm_xor_si128(x1, x5); + x4 = _mm_add_epi32(x4, x1); + x5 = x4; + x4 = _mm_slli_epi32(x4, 9); + x5 = _mm_srli_epi32(x5, 23); + x2 = _mm_xor_si128(x2, x4); + x4 = x1; + x2 = _mm_xor_si128(x2, x5); + x1 = _mm_shuffle_epi32(x1, 0x93); + x4 = _mm_add_epi32(x4, x2); + x5 = x4; + x4 = _mm_slli_epi32(x4, 13); + x5 = _mm_srli_epi32(x5, 19); + x3 = _mm_xor_si128(x3, x4); + x4 = x2; + x3 = _mm_xor_si128(x3, x5); + x2 = _mm_shuffle_epi32(x2, 0x4e); + x4 = _mm_add_epi32(x4, x3); + x5 = x4; + x4 = _mm_slli_epi32(x4, 18); + x5 = _mm_srli_epi32(x5, 14); + x0 = _mm_xor_si128(x0, x4); + x3 = _mm_shuffle_epi32(x3, 0x39); + x0 = _mm_xor_si128(x0, x5); + } + + x0 = _mm_add_epi32(x0, t0); + x1 = _mm_add_epi32(x1, t1); + x2 = _mm_add_epi32(x2, t2); + x3 = _mm_add_epi32(x3, t3); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + xmmp = (xmmi *)scrypt_block(Bout, (i / 2) + half); + xmmp[0] = x0; + xmmp[1] = x1; + xmmp[2] = x2; + xmmp[3] = x3; + } +} + +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #undef SCRYPT_MIX + #define SCRYPT_MIX "Salsa/8-SSE2" + #undef 
SCRYPT_SALSA_INCLUDED + #define SCRYPT_SALSA_INCLUDED +#endif + +/* used by avx,etc as well */ +#if defined(SCRYPT_SALSA_INCLUDED) + /* + Default layout: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 + 12 13 14 15 + + SSE2 layout: + 0 5 10 15 + 12 1 6 11 + 8 13 2 7 + 4 9 14 3 + */ + + static void STDCALL + salsa_core_tangle_sse2(uint32_t *blocks, size_t count) { + uint32_t t; + while (count--) { + t = blocks[1]; blocks[1] = blocks[5]; blocks[5] = t; + t = blocks[2]; blocks[2] = blocks[10]; blocks[10] = t; + t = blocks[3]; blocks[3] = blocks[15]; blocks[15] = t; + t = blocks[4]; blocks[4] = blocks[12]; blocks[12] = t; + t = blocks[7]; blocks[7] = blocks[11]; blocks[11] = t; + t = blocks[9]; blocks[9] = blocks[13]; blocks[13] = t; + blocks += 16; + } + } +#endif + diff --git a/scryptjane/scrypt-jane-mix_salsa.h b/scryptjane/scrypt-jane-mix_salsa.h new file mode 100644 index 00000000..33f33409 --- /dev/null +++ b/scryptjane/scrypt-jane-mix_salsa.h @@ -0,0 +1,70 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_SALSA_INCLUDED) + +#undef SCRYPT_MIX +#define SCRYPT_MIX "Salsa20/8 Ref" + +#undef SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_INCLUDED +#define SCRYPT_SALSA_BASIC + +static void +salsa_core_basic(uint32_t state[16]) { + size_t rounds = 8; + uint32_t x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,t; + + x0 = state[0]; + x1 = state[1]; + x2 = state[2]; + x3 = state[3]; + x4 = state[4]; + x5 = state[5]; + x6 = state[6]; + x7 = state[7]; + x8 = state[8]; + x9 = state[9]; + x10 = state[10]; + x11 = state[11]; + x12 = state[12]; + x13 = state[13]; + x14 = state[14]; + x15 = state[15]; + + #define quarter(a,b,c,d) \ + t = a+d; t = ROTL32(t, 7); b ^= t; \ + t = b+a; t = ROTL32(t, 9); c ^= t; \ + t = c+b; t = ROTL32(t, 13); d ^= t; \ + t = d+c; t = ROTL32(t, 18); a ^= t; \ + + for (; rounds; rounds -= 2) { + quarter( x0, x4, x8,x12) + quarter( x5, x9,x13, x1) + quarter(x10,x14, x2, x6) + quarter(x15, x3, x7,x11) + quarter( x0, x1, x2, x3) + quarter( x5, x6, x7, x4) + quarter(x10,x11, x8, x9) + quarter(x15,x12,x13,x14) + } + + state[0] += x0; + state[1] += x1; + state[2] += x2; + state[3] += x3; + state[4] += x4; + state[5] += x5; + state[6] += x6; + state[7] += x7; + state[8] += x8; + state[9] += x9; + state[10] += x10; + state[11] += x11; + state[12] += x12; + state[13] += x13; + state[14] += x14; + state[15] += x15; + + #undef quarter +} + +#endif + diff --git a/scryptjane/scrypt-jane-pbkdf2.h b/scryptjane/scrypt-jane-pbkdf2.h new file mode 100644 index 00000000..711e3d63 --- /dev/null +++ b/scryptjane/scrypt-jane-pbkdf2.h @@ -0,0 +1,112 @@ +typedef struct scrypt_hmac_state_t { + scrypt_hash_state inner, outer; +} scrypt_hmac_state; + + +static void +scrypt_hash(scrypt_hash_digest hash, const uint8_t *m, size_t mlen) { + scrypt_hash_state st; + scrypt_hash_init(&st); + scrypt_hash_update(&st, m, mlen); + scrypt_hash_finish(&st, hash); +} + +/* hmac */ +static void +scrypt_hmac_init(scrypt_hmac_state *st, const uint8_t *key, size_t keylen) { + uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0}; + size_t i; + + scrypt_hash_init(&st->inner); + scrypt_hash_init(&st->outer); + + if (keylen <= SCRYPT_HASH_BLOCK_SIZE) { + /* use the key directly if it's <= blocksize bytes */ + memcpy(pad, key, keylen); + } else { + /* if it's > blocksize bytes, hash it */ + scrypt_hash(pad, key, keylen); + } + + /* inner = (key ^ 0x36) */ + /* h(inner || ...) 
*/ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= 0x36; + scrypt_hash_update(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE); + + /* outer = (key ^ 0x5c) */ + /* h(outer || ...) */ + for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++) + pad[i] ^= (0x5c ^ 0x36); + scrypt_hash_update(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE); + + scrypt_ensure_zero(pad, sizeof(pad)); +} + +static void +scrypt_hmac_update(scrypt_hmac_state *st, const uint8_t *m, size_t mlen) { + /* h(inner || m...) */ + scrypt_hash_update(&st->inner, m, mlen); +} + +static void +scrypt_hmac_finish(scrypt_hmac_state *st, scrypt_hash_digest mac) { + /* h(inner || m) */ + scrypt_hash_digest innerhash; + scrypt_hash_finish(&st->inner, innerhash); + + /* h(outer || h(inner || m)) */ + scrypt_hash_update(&st->outer, innerhash, sizeof(innerhash)); + scrypt_hash_finish(&st->outer, mac); + + scrypt_ensure_zero(st, sizeof(*st)); +} + +static void +scrypt_pbkdf2(const uint8_t *password, size_t password_len, const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *out, size_t bytes) { + scrypt_hmac_state hmac_pw, hmac_pw_salt, work; + scrypt_hash_digest ti, u; + uint8_t be[4]; + uint32_t i, j, blocks; + uint64_t c; + + /* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */ + + /* hmac(password, ...) */ + scrypt_hmac_init(&hmac_pw, password, password_len); + + /* hmac(password, salt...) */ + hmac_pw_salt = hmac_pw; + scrypt_hmac_update(&hmac_pw_salt, salt, salt_len); + + blocks = ((uint32_t)bytes + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE; + for (i = 1; i <= blocks; i++) { + /* U1 = hmac(password, salt || be(i)) */ + U32TO8_BE(be, i); + work = hmac_pw_salt; + scrypt_hmac_update(&work, be, 4); + scrypt_hmac_finish(&work, ti); + memcpy(u, ti, sizeof(u)); + + /* T[i] = U1 ^ U2 ^ U3... */ + for (c = 0; c < N - 1; c++) { + /* UX = hmac(password, U{X-1}) */ + work = hmac_pw; + scrypt_hmac_update(&work, u, SCRYPT_HASH_DIGEST_SIZE); + scrypt_hmac_finish(&work, u); + + /* T[i] ^= UX */ + for (j = 0; j < sizeof(u); j++) + ti[j] ^= u[j]; + } + + memcpy(out, ti, (bytes > SCRYPT_HASH_DIGEST_SIZE) ? 
SCRYPT_HASH_DIGEST_SIZE : bytes); + out += SCRYPT_HASH_DIGEST_SIZE; + bytes -= SCRYPT_HASH_DIGEST_SIZE; + } + + scrypt_ensure_zero(ti, sizeof(ti)); + scrypt_ensure_zero(u, sizeof(u)); + scrypt_ensure_zero(&hmac_pw, sizeof(hmac_pw)); + scrypt_ensure_zero(&hmac_pw_salt, sizeof(hmac_pw_salt)); +} diff --git a/scryptjane/scrypt-jane-portable-x86.h b/scryptjane/scrypt-jane-portable-x86.h new file mode 100644 index 00000000..03282fa8 --- /dev/null +++ b/scryptjane/scrypt-jane-portable-x86.h @@ -0,0 +1,360 @@ +#if defined(CPU_X86) && (defined(COMPILER_MSVC) || defined(COMPILER_GCC)) + #define X86ASM + /* gcc 2.95 royally screws up stack alignments on variables */ + #if (defined(COMPILER_MSVC6PP_AND_LATER) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 30000))) + #define X86ASM_SSE + #define X86ASM_SSE2 + #endif + #if ((defined(COMPILER_MSVC) && (COMPILER_MSVC >= 1400)) || (defined(COMPILER_GCC) && (COMPILER_GCC >= 40102))) + #define X86ASM_SSSE3 + #endif + #if ((defined(COMPILER_GCC) && (COMPILER_GCC >= 40400))) + #define X86ASM_AVX + #endif +#endif + +#if defined(CPU_X86_64) && defined(COMPILER_GCC) + #define X86_64ASM + #define X86_64ASM_SSE2 + #if (COMPILER_GCC >= 40102) + #define X86_64ASM_SSSE3 + #endif + #if (COMPILER_GCC >= 40400) + #define X86_64ASM_AVX + #endif +#endif + +#if defined(COMPILER_MSVC) + #define X86_INTRINSIC + #if defined(CPU_X86_64) || defined(X86ASM_SSE) + #define X86_INTRINSIC_SSE + #endif + #if defined(CPU_X86_64) || defined(X86ASM_SSE2) + #define X86_INTRINSIC_SSE2 + #endif + #if (COMPILER_MSVC >= 1400) + #define X86_INTRINSIC_SSSE3 + #endif +#endif + +#if defined(COMPILER_MSVC) && defined(CPU_X86_64) + #define X86_64USE_INTRINSIC +#endif + +#if defined(COMPILER_GCC) && defined(CPU_X86_FORCE_INTRINSICS) + #define X86_INTRINSIC + #if defined(__SSE__) + #define X86_INTRINSIC_SSE + #endif + #if defined(__SSE2__) + #define X86_INTRINSIC_SSE2 + #endif + #if defined(__SSSE3__) + #define X86_INTRINSIC_SSSE3 + #endif + #if defined(__AVX__) + #define X86_INTRINSIC_AVX + #endif +#endif + +/* only use simd on windows (or SSE2 on gcc)! 
*/ +#if defined(CPU_X86_FORCE_INTRINSICS) || defined(X86_INTRINSIC) + #if defined(X86_INTRINSIC_SSE) + #define X86_INTRINSIC + #include <mmintrin.h> + #include <xmmintrin.h> + typedef __m64 qmm; + typedef __m128 xmm; + typedef __m128d xmmd; + #endif + #if defined(X86_INTRINSIC_SSE2) + #define X86_INTRINSIC_SSE2 + #include <emmintrin.h> + typedef __m128i xmmi; + #endif + #if defined(X86_INTRINSIC_SSSE3) + #define X86_INTRINSIC_SSSE3 + #include <tmmintrin.h> + #endif +#endif + + +#if defined(X86_INTRINSIC_SSE2) + typedef union packedelem8_t { + uint8_t u[16]; + xmmi v; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + xmmi v; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + xmmi v; + } packedelem64; +#else + typedef union packedelem8_t { + uint8_t u[16]; + uint32_t dw[4]; + } packedelem8; + + typedef union packedelem32_t { + uint32_t u[4]; + uint8_t b[16]; + } packedelem32; + + typedef union packedelem64_t { + uint64_t u[2]; + uint8_t b[16]; + } packedelem64; +#endif + +#if defined(X86_INTRINSIC_SSSE3) || defined(X86ASM_SSSE3) || defined(X86_64ASM_SSSE3) + const packedelem8 MM16 ssse3_rotr16_64bit = {{2,3,4,5,6,7,0,1,10,11,12,13,14,15,8,9}}; + const packedelem8 MM16 ssse3_rotl16_32bit = {{2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13}}; + const packedelem8 MM16 ssse3_rotl8_32bit = {{3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14}}; + const packedelem8 MM16 ssse3_endian_swap_64bit = {{7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8}}; +#endif + +/* + x86 inline asm for gcc/msvc. usage: + + asm_naked_fn_proto(return_type, name) (type parm1, type parm2..) + asm_naked_fn(name) + a1(..) + a2(.., ..) + a3(.., .., ..) + a1(ret) + asm_naked_fn_end(name) +*/ + +#if defined(X86ASM) || defined(X86_64ASM) + +#if defined(COMPILER_MSVC) + #pragma warning(disable : 4731) /* frame pointer modified by inline assembly */ + #define a1(x) __asm {x} + #define a2(x, y) __asm {x, y} + #define a3(x, y, z) __asm {x, y, z} + #define a4(x, y, z, w) __asm {x, y, z, w} + #define al(x) __asm {label##x:} + #define aj(x, y, z) __asm {x label##y} + #define asm_align8 a1(ALIGN 8) + #define asm_align16 a1(ALIGN 16) + + #define asm_naked_fn_proto(type, fn) static NAKED type STDCALL fn + #define asm_naked_fn(fn) { + #define asm_naked_fn_end(fn) } +#elif defined(COMPILER_GCC) + #define GNU_AS1(x) #x ";\n" + #define GNU_AS2(x, y) #x ", " #y ";\n" + #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";\n" + #define GNU_AS4(x, y, z, w) #x ", " #y ", " #z ", " #w ";\n" + #define GNU_ASL(x) "\n" #x ":\n" + #define GNU_ASJ(x, y, z) #x " " #y #z ";" + + #define a1(x) GNU_AS1(x) + #define a2(x, y) GNU_AS2(x, y) + #define a3(x, y, z) GNU_AS3(x, y, z) + #define a4(x, y, z, w) GNU_AS4(x, y, z, w) + #define al(x) GNU_ASL(x) + #define aj(x, y, z) GNU_ASJ(x, y, z) + #define asm_align8 a1(.align 8) + #define asm_align16 a1(.align 16) + + #define asm_naked_fn_proto(type, fn) extern type STDCALL fn + #define asm_naked_fn(fn) ; __asm__ (".intel_syntax noprefix;\n.text\n" asm_align16 GNU_ASL(fn) + #define asm_naked_fn_end(fn) ".att_syntax prefix;\n.type " #fn ",@function\n.size " #fn ",.-" #fn "\n" ); + #define asm_gcc() __asm__ __volatile__(".intel_syntax noprefix;\n" + #define asm_gcc_parms() ".att_syntax prefix;" + #define asm_gcc_trashed() __asm__ __volatile__("" ::: + #define asm_gcc_end() ); +#else + need x86 asm +#endif + +#endif /* X86ASM || X86_64ASM */ + + +#if defined(CPU_X86) || defined(CPU_X86_64) + +typedef enum cpu_flags_x86_t { + cpu_mmx = 1 << 0, + cpu_sse = 1 << 1, + cpu_sse2 = 1 << 2, + cpu_sse3 = 1 << 3, + cpu_ssse3 = 1 << 4, + cpu_sse4_1 = 1 << 5, + cpu_sse4_2 = 1 << 6, 
+ cpu_avx = 1 << 7 +} cpu_flags_x86; + +typedef enum cpu_vendors_x86_t { + cpu_nobody, + cpu_intel, + cpu_amd +} cpu_vendors_x86; + +typedef struct x86_regs_t { + uint32_t eax, ebx, ecx, edx; +} x86_regs; + +#if defined(X86ASM) +asm_naked_fn_proto(int, has_cpuid)(void) +asm_naked_fn(has_cpuid) + a1(pushfd) + a1(pop eax) + a2(mov ecx, eax) + a2(xor eax, 0x200000) + a1(push eax) + a1(popfd) + a1(pushfd) + a1(pop eax) + a2(xor eax, ecx) + a2(shr eax, 21) + a2(and eax, 1) + a1(push ecx) + a1(popfd) + a1(ret) +asm_naked_fn_end(has_cpuid) +#endif /* X86ASM */ + + +static void NOINLINE +get_cpuid(x86_regs *regs, uint32_t flags) { +#if defined(COMPILER_MSVC) + __cpuid((int *)regs, (int)flags); +#else + #if defined(CPU_X86_64) + #define cpuid_bx rbx + #else + #define cpuid_bx ebx + #endif + + asm_gcc() + a1(push cpuid_bx) + a1(cpuid) + a2(mov [%1 + 0], eax) + a2(mov [%1 + 4], ebx) + a2(mov [%1 + 8], ecx) + a2(mov [%1 + 12], edx) + a1(pop cpuid_bx) + asm_gcc_parms() : "+a"(flags) : "S"(regs) : "%ecx", "%edx", "cc" + asm_gcc_end() +#endif +} + +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) +static uint64_t NOINLINE +get_xgetbv(uint32_t flags) { +#if defined(COMPILER_MSVC) + return _xgetbv(flags); +#else + uint32_t lo, hi; + asm_gcc() + a1(xgetbv) + asm_gcc_parms() : "+c"(flags), "=a" (lo), "=d" (hi) + asm_gcc_end() + return ((uint64_t)lo | ((uint64_t)hi << 32)); +#endif +} +#endif /* X86ASM_AVX || X86_64ASM_AVX */ + +#if defined(SCRYPT_TEST_SPEED) +size_t cpu_detect_mask = (size_t)-1; +#endif + +static size_t +detect_cpu(void) { + union { uint8_t s[12]; uint32_t i[3]; } vendor_string; + cpu_vendors_x86 vendor = cpu_nobody; + x86_regs regs; + uint32_t max_level; + size_t cpu_flags = 0; +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + uint64_t xgetbv_flags; +#endif + +#if defined(CPU_X86) + if (!has_cpuid()) + return cpu_flags; +#endif + + get_cpuid(&regs, 0); + max_level = regs.eax; + vendor_string.i[0] = regs.ebx; + vendor_string.i[1] = regs.edx; + vendor_string.i[2] = regs.ecx; + + if (scrypt_verify(vendor_string.s, (const uint8_t *)"GenuineIntel", 12)) + vendor = cpu_intel; + else if (scrypt_verify(vendor_string.s, (const uint8_t *)"AuthenticAMD", 12)) + vendor = cpu_amd; + + if (max_level & 0x00000500) { + /* "Intel P5 pre-B0" */ + cpu_flags |= cpu_mmx; + return cpu_flags; + } + + if (max_level < 1) + return cpu_flags; + + get_cpuid(&regs, 1); +#if defined(X86ASM_AVX) || defined(X86_64ASM_AVX) + /* xsave/xrestore */ + if (regs.ecx & (1 << 27)) { + xgetbv_flags = get_xgetbv(0); + if ((regs.ecx & (1 << 28)) && (xgetbv_flags & 0x6)) cpu_flags |= cpu_avx; + } +#endif + if (regs.ecx & (1 << 20)) cpu_flags |= cpu_sse4_2; + if (regs.ecx & (1 << 19)) cpu_flags |= cpu_sse4_1; + if (regs.ecx & (1 << 9)) cpu_flags |= cpu_ssse3; + if (regs.ecx & (1 )) cpu_flags |= cpu_sse3; + if (regs.edx & (1 << 26)) cpu_flags |= cpu_sse2; + if (regs.edx & (1 << 25)) cpu_flags |= cpu_sse; + if (regs.edx & (1 << 23)) cpu_flags |= cpu_mmx; + +#if defined(SCRYPT_TEST_SPEED) + cpu_flags &= cpu_detect_mask; +#endif + + return cpu_flags; +} + +#if defined(SCRYPT_TEST_SPEED) +static const char * +get_top_cpuflag_desc(size_t flag) { + if (flag & cpu_avx) return "AVX"; + else if (flag & cpu_sse4_2) return "SSE4.2"; + else if (flag & cpu_sse4_1) return "SSE4.1"; + else if (flag & cpu_ssse3) return "SSSE3"; + else if (flag & cpu_sse2) return "SSE2"; + else if (flag & cpu_sse) return "SSE"; + else if (flag & cpu_mmx) return "MMX"; + else return "Basic"; +} +#endif + 
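detect_cpu's AVX branch encodes the full OS-support handshake: CPUID.1:ECX bit 27 (OSXSAVE) says xgetbv may be executed, bit 28 says the core implements AVX, and XCR0 mask 0x6 (bits 1 and 2) says the OS actually saves XMM and YMM state on context switches. A hedged standalone equivalent for GCC, using <cpuid.h> instead of the patch's own get_cpuid/get_xgetbv helpers and checking both XCR0 bits explicitly (names here are illustrative):

#include <cpuid.h>
#include <stdint.h>

static int
os_supports_avx(void) {
	unsigned int eax, ebx, ecx, edx;
	uint32_t lo, hi;
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;
	if ((ecx & (1u << 27)) == 0 || (ecx & (1u << 28)) == 0)
		return 0; /* no OSXSAVE, or no AVX in the core */
	/* xgetbv with ecx = 0 reads XCR0, as get_xgetbv(0) does above */
	__asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(0));
	return (lo & 0x6) == 0x6; /* XMM and YMM state enabled by the OS */
}
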
+/* enable the highest system-wide option */ +#if defined(SCRYPT_CHOOSE_COMPILETIME) + #if !defined(__AVX__) + #undef X86_64ASM_AVX + #undef X86ASM_AVX + #undef X86_INTRINSIC_AVX + #endif + #if !defined(__SSSE3__) + #undef X86_64ASM_SSSE3 + #undef X86ASM_SSSE3 + #undef X86_INTRINSIC_SSSE3 + #endif + #if !defined(__SSE2__) + #undef X86_64ASM_SSE2 + #undef X86ASM_SSE2 + #undef X86_INTRINSIC_SSE2 + #endif +#endif + +#endif /* defined(CPU_X86) || defined(CPU_X86_64) */ \ No newline at end of file diff --git a/scryptjane/scrypt-jane-portable.h b/scryptjane/scrypt-jane-portable.h new file mode 100644 index 00000000..33c8c2ca --- /dev/null +++ b/scryptjane/scrypt-jane-portable.h @@ -0,0 +1,281 @@ +/* determine os */ +#if defined(_WIN32) || defined(_WIN64) || defined(__TOS_WIN__) || defined(__WINDOWS__) + #include <windows.h> + #include <wincrypt.h> + #define OS_WINDOWS +#elif defined(sun) || defined(__sun) || defined(__SVR4) || defined(__svr4__) + #include <sys/mman.h> + #include <sys/time.h> + #include <fcntl.h> + + #define OS_SOLARIS +#else + #include <sys/mman.h> + #include <sys/time.h> + #include <sys/param.h> /* need this to define BSD */ + #include <unistd.h> + #include <fcntl.h> + + #define OS_NIX + #if defined(__linux__) + #include <endian.h> + #define OS_LINUX + #elif defined(BSD) + #define OS_BSD + + #if defined(MACOS_X) || (defined(__APPLE__) & defined(__MACH__)) + #define OS_OSX + #elif defined(macintosh) || defined(Macintosh) + #define OS_MAC + #elif defined(__OpenBSD__) + #define OS_OPENBSD + #endif + #endif +#endif + + +/* determine compiler */ +#if defined(_MSC_VER) + #define COMPILER_MSVC _MSC_VER + #if ((COMPILER_MSVC > 1200) || defined(_mm_free)) + #define COMPILER_MSVC6PP_AND_LATER + #endif + #if (COMPILER_MSVC >= 1500) + #define COMPILER_HAS_TMMINTRIN + #endif + + #pragma warning(disable : 4127) /* conditional expression is constant */ + #pragma warning(disable : 4100) /* unreferenced formal parameter */ + + #define _CRT_SECURE_NO_WARNINGS + #include <float.h> + #include <stdlib.h> /* _rotl */ + #include <intrin.h> + + typedef unsigned char uint8_t; + typedef unsigned short uint16_t; + typedef unsigned int uint32_t; + typedef signed int int32_t; + typedef unsigned __int64 uint64_t; + typedef signed __int64 int64_t; + + #define ROTL32(a,b) _rotl(a,b) + #define ROTR32(a,b) _rotr(a,b) + #define ROTL64(a,b) _rotl64(a,b) + #define ROTR64(a,b) _rotr64(a,b) + #undef NOINLINE + #define NOINLINE __declspec(noinline) + #undef INLINE + #define INLINE __forceinline + #undef FASTCALL + #define FASTCALL __fastcall + #undef CDECL + #define CDECL __cdecl + #undef STDCALL + #define STDCALL __stdcall + #undef NAKED + #define NAKED __declspec(naked) + #define MM16 __declspec(align(16)) +#endif +#if defined(__ICC) + #define COMPILER_INTEL +#endif +#if defined(__GNUC__) + #if (__GNUC__ >= 3) + #define COMPILER_GCC_PATCHLEVEL __GNUC_PATCHLEVEL__ + #else + #define COMPILER_GCC_PATCHLEVEL 0 + #endif + #define COMPILER_GCC (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + COMPILER_GCC_PATCHLEVEL) + #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b))) + #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b))) + #define ROTL64(a,b) (((a) << (b)) | ((a) >> (64 - b))) + #define ROTR64(a,b) (((a) >> (b)) | ((a) << (64 - b))) + #undef NOINLINE + #if (COMPILER_GCC >= 30000) + #define NOINLINE __attribute__((noinline)) + #else + #define NOINLINE + #endif + #undef INLINE + #if (COMPILER_GCC >= 30000) + #define INLINE __attribute__((always_inline)) + #else + #define INLINE inline + #endif + #undef FASTCALL + #if (COMPILER_GCC >= 30400) + #define FASTCALL __attribute__((fastcall)) + #else + #define FASTCALL + #endif + #undef CDECL + #define CDECL __attribute__((cdecl)) + #undef STDCALL + #define STDCALL __attribute__((stdcall)) + #define MM16 
+	#include <stdint.h>
+#endif
+#if defined(__MINGW32__) || defined(__MINGW64__)
+	#define COMPILER_MINGW
+#endif
+#if defined(__PATHCC__)
+	#define COMPILER_PATHCC
+#endif
+
+#define OPTIONAL_INLINE
+#if defined(OPTIONAL_INLINE)
+	#undef OPTIONAL_INLINE
+	#define OPTIONAL_INLINE INLINE
+#else
+	#define OPTIONAL_INLINE
+#endif
+
+#define CRYPTO_FN NOINLINE STDCALL
+
+/* determine cpu */
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__ ) || defined(_M_X64)
+	#define CPU_X86_64
+#elif defined(__i586__) || defined(__i686__) || (defined(_M_IX86) && (_M_IX86 >= 500))
+	#define CPU_X86 500
+#elif defined(__i486__) || (defined(_M_IX86) && (_M_IX86 >= 400))
+	#define CPU_X86 400
+#elif defined(__i386__) || (defined(_M_IX86) && (_M_IX86 >= 300)) || defined(__X86__) || defined(_X86_) || defined(__I86__)
+	#define CPU_X86 300
+#elif defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(_M_IA64) || defined(__ia64)
+	#define CPU_IA64
+#endif
+
+#if defined(__sparc__) || defined(__sparc) || defined(__sparcv9)
+	#define CPU_SPARC
+	#if defined(__sparcv9)
+		#define CPU_SPARC64
+	#endif
+#endif
+
+#if defined(CPU_X86_64) || defined(CPU_IA64) || defined(CPU_SPARC64) || defined(__64BIT__) || defined(__LP64__) || defined(_LP64) || (defined(_MIPS_SZLONG) && (_MIPS_SZLONG == 64))
+	#define CPU_64BITS
+	#undef FASTCALL
+	#define FASTCALL
+	#undef CDECL
+	#define CDECL
+	#undef STDCALL
+	#define STDCALL
+#endif
+
+#if defined(powerpc) || defined(__PPC__) || defined(__ppc__) || defined(_ARCH_PPC) || defined(__powerpc__) || defined(__powerpc) || defined(POWERPC) || defined(_M_PPC)
+	#define CPU_PPC
+	#if defined(_ARCH_PWR7)
+		#define CPU_POWER7
+	#elif defined(__64BIT__)
+		#define CPU_PPC64
+	#else
+		#define CPU_PPC32
+	#endif
+#endif
+
+#if defined(__hppa__) || defined(__hppa)
+	#define CPU_HPPA
+#endif
+
+#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA)
+	#define CPU_ALPHA
+#endif
+
+/* endian */
+
+#if ((defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN)) || \
+	(defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && (BYTE_ORDER == LITTLE_ENDIAN)) || \
+	(defined(CPU_X86) || defined(CPU_X86_64)) || \
+	(defined(vax) || defined(MIPSEL) || defined(_MIPSEL)))
+#define CPU_LE
+#elif ((defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN)) || \
+	(defined(BYTE_ORDER) && defined(BIG_ENDIAN) && (BYTE_ORDER == BIG_ENDIAN)) || \
+	(defined(CPU_SPARC) || defined(CPU_PPC) || defined(mc68000) || defined(sel)) || defined(_MIPSEB))
+#define CPU_BE
+#else
+	/* unknown endian!
*/ +#endif + + +#define U8TO32_BE(p) \ + (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \ + ((uint32_t)((p)[2]) << 8) | ((uint32_t)((p)[3]) )) + +#define U8TO32_LE(p) \ + (((uint32_t)((p)[0]) ) | ((uint32_t)((p)[1]) << 8) | \ + ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24)) + +#define U32TO8_BE(p, v) \ + (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \ + (p)[2] = (uint8_t)((v) >> 8); (p)[3] = (uint8_t)((v) ); + +#define U32TO8_LE(p, v) \ + (p)[0] = (uint8_t)((v) ); (p)[1] = (uint8_t)((v) >> 8); \ + (p)[2] = (uint8_t)((v) >> 16); (p)[3] = (uint8_t)((v) >> 24); + +#define U8TO64_BE(p) \ + (((uint64_t)U8TO32_BE(p) << 32) | (uint64_t)U8TO32_BE((p) + 4)) + +#define U8TO64_LE(p) \ + (((uint64_t)U8TO32_LE(p)) | ((uint64_t)U8TO32_LE((p) + 4) << 32)) + +#define U64TO8_BE(p, v) \ + U32TO8_BE((p), (uint32_t)((v) >> 32)); \ + U32TO8_BE((p) + 4, (uint32_t)((v) )); + +#define U64TO8_LE(p, v) \ + U32TO8_LE((p), (uint32_t)((v) )); \ + U32TO8_LE((p) + 4, (uint32_t)((v) >> 32)); + +#define U32_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00 ) | (((v) >> 8) & 0xFF00FF ); \ + (v) = ((v) << 16) | ((v) >> 16); \ +} + +#define U64_SWAP(v) { \ + (v) = (((v) << 8) & 0xFF00FF00FF00FF00ull ) | (((v) >> 8) & 0x00FF00FF00FF00FFull ); \ + (v) = (((v) << 16) & 0xFFFF0000FFFF0000ull ) | (((v) >> 16) & 0x0000FFFF0000FFFFull ); \ + (v) = ((v) << 32) | ((v) >> 32); \ +} + +static int +scrypt_verify(const uint8_t *x, const uint8_t *y, size_t len) { + uint32_t differentbits = 0; + while (len--) + differentbits |= (*x++ ^ *y++); + return (1 & ((differentbits - 1) >> 8)); +} + +void +scrypt_ensure_zero(void *p, size_t len) { +#if ((defined(CPU_X86) || defined(CPU_X86_64)) && defined(COMPILER_MSVC)) + __stosb((unsigned char *)p, 0, len); +#elif (defined(CPU_X86) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushl %%edi;\n" + "pushl %%ecx;\n" + "rep stosb;\n" + "popl %%ecx;\n" + "popl %%edi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#elif (defined(CPU_X86_64) && defined(COMPILER_GCC)) + __asm__ __volatile__( + "pushq %%rdi;\n" + "pushq %%rcx;\n" + "rep stosb;\n" + "popq %%rcx;\n" + "popq %%rdi;\n" + :: "a"(0), "D"(p), "c"(len) : "cc", "memory" + ); +#else + volatile uint8_t *b = (volatile uint8_t *)p; + size_t i; + for (i = 0; i < len; i++) + b[i] = 0; +#endif +} + +#include "scrypt-jane-portable-x86.h" + diff --git a/scryptjane/scrypt-jane-romix-basic.h b/scryptjane/scrypt-jane-romix-basic.h new file mode 100644 index 00000000..ca1df02d --- /dev/null +++ b/scryptjane/scrypt-jane-romix-basic.h @@ -0,0 +1,67 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +/* function type returned by scrypt_getROMix, used with cpu detection */ +typedef void (FASTCALL *scrypt_ROMixfn)(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r); +#endif + +/* romix pre/post nop function */ +static void STDCALL +scrypt_romix_nop(scrypt_mix_word_t *blocks, size_t nblocks) { +} + +/* romix pre/post endian conversion function */ +static void STDCALL +scrypt_romix_convert_endian(scrypt_mix_word_t *blocks, size_t nblocks) { +#if !defined(CPU_LE) + static const union { uint8_t b[2]; uint16_t w; } endian_test = {{1,0}}; + size_t i; + if (endian_test.w == 0x100) { + nblocks *= SCRYPT_BLOCK_WORDS; + for (i = 0; i < nblocks; i++) { + SCRYPT_WORD_ENDIAN_SWAP(blocks[i]); + } + } +#endif +} + +/* chunkmix test function */ +typedef void (STDCALL *chunkmixfn)(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t 
*Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r); +typedef void (STDCALL *blockfixfn)(scrypt_mix_word_t *blocks, size_t nblocks); + +static int +scrypt_test_mix_instance(chunkmixfn mixfn, blockfixfn prefn, blockfixfn postfn, const uint8_t expected[16]) { + /* r = 2, (2 * r) = 4 blocks in a chunk, 4 * SCRYPT_BLOCK_WORDS total */ + const uint32_t r = 2, blocks = 2 * r, words = blocks * SCRYPT_BLOCK_WORDS; + scrypt_mix_word_t MM16 chunk[2][4 * SCRYPT_BLOCK_WORDS], v; + uint8_t final[16]; + size_t i; + + for (i = 0; i < words; i++) { + v = (scrypt_mix_word_t)i; + v = (v << 8) | v; + v = (v << 16) | v; + chunk[0][i] = v; + } + + prefn(chunk[0], blocks); + mixfn(chunk[1], chunk[0], NULL, r); + postfn(chunk[1], blocks); + + /* grab the last 16 bytes of the final block */ + for (i = 0; i < 16; i += sizeof(scrypt_mix_word_t)) { + SCRYPT_WORDTO8_LE(final + i, chunk[1][words - (16 / sizeof(scrypt_mix_word_t)) + (i / sizeof(scrypt_mix_word_t))]); + } + + return scrypt_verify(expected, final, 16); +} + +/* returns a pointer to item i, where item is len scrypt_mix_word_t's long */ +static scrypt_mix_word_t * +scrypt_item(scrypt_mix_word_t *base, scrypt_mix_word_t i, scrypt_mix_word_t len) { + return base + (i * len); +} + +/* returns a pointer to block i */ +static scrypt_mix_word_t * +scrypt_block(scrypt_mix_word_t *base, scrypt_mix_word_t i) { + return base + (i * SCRYPT_BLOCK_WORDS); +} diff --git a/scryptjane/scrypt-jane-romix-template.h b/scryptjane/scrypt-jane-romix-template.h new file mode 100644 index 00000000..2fd7674e --- /dev/null +++ b/scryptjane/scrypt-jane-romix-template.h @@ -0,0 +1,118 @@ +#if !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) + +#if defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_ROMIX_FN +#define SCRYPT_ROMIX_FN scrypt_ROMix +#endif + +#undef SCRYPT_HAVE_ROMIX +#define SCRYPT_HAVE_ROMIX + +#if !defined(SCRYPT_CHUNKMIX_FN) + +#define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_basic + +/* + Bout = ChunkMix(Bin) + + 2*r: number of blocks in the chunk +*/ +static void STDCALL +SCRYPT_CHUNKMIX_FN(scrypt_mix_word_t *Bout/*[chunkWords]*/, scrypt_mix_word_t *Bin/*[chunkWords]*/, scrypt_mix_word_t *Bxor/*[chunkWords]*/, uint32_t r) { + scrypt_mix_word_t MM16 X[SCRYPT_BLOCK_WORDS], *block; + uint32_t i, j, blocksPerChunk = r * 2, half = 0; + + /* 1: X = B_{2r - 1} */ + block = scrypt_block(Bin, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] = block[i]; + + if (Bxor) { + block = scrypt_block(Bxor, blocksPerChunk - 1); + for (i = 0; i < SCRYPT_BLOCK_WORDS; i++) + X[i] ^= block[i]; + } + + /* 2: for i = 0 to 2r - 1 do */ + for (i = 0; i < blocksPerChunk; i++, half ^= r) { + /* 3: X = H(X ^ B_i) */ + block = scrypt_block(Bin, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + + if (Bxor) { + block = scrypt_block(Bxor, i); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + X[j] ^= block[j]; + } + SCRYPT_MIX_FN(X); + + /* 4: Y_i = X */ + /* 6: B'[0..r-1] = Y_even */ + /* 6: B'[r..2r-1] = Y_odd */ + block = scrypt_block(Bout, (i / 2) + half); + for (j = 0; j < SCRYPT_BLOCK_WORDS; j++) + block[j] = X[j]; + } +} +#endif + +/* + X = ROMix(X) + + X: chunk to mix + Y: scratch chunk + N: number of rounds + V[N]: array of chunks to randomly index in to + 2*r: number of blocks in a chunk +*/ + +static void NOINLINE FASTCALL +SCRYPT_ROMIX_FN(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[N * chunkWords]*/, uint32_t N, uint32_t r) { + uint32_t i, j, chunkWords = 
SCRYPT_BLOCK_WORDS * r * 2;
+	scrypt_mix_word_t *block = V;
+
+	SCRYPT_ROMIX_TANGLE_FN(X, r * 2);
+
+	/* 1: X = B */
+	/* implicit */
+
+	/* 2: for i = 0 to N - 1 do */
+	memcpy(block, X, chunkWords * sizeof(scrypt_mix_word_t));
+	for (i = 0; i < N - 1; i++, block += chunkWords) {
+		/* 3: V_i = X */
+		/* 4: X = H(X) */
+		SCRYPT_CHUNKMIX_FN(block + chunkWords, block, NULL, r);
+	}
+	SCRYPT_CHUNKMIX_FN(X, block, NULL, r);
+
+	/* 6: for i = 0 to N - 1 do */
+	for (i = 0; i < N; i += 2) {
+		/* 7: j = Integerify(X) % N */
+		j = X[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+		/* 8: Y = H(X ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(Y, X, scrypt_item(V, j, chunkWords), r);
+
+		/* 7: j = Integerify(Y) % N */
+		j = Y[chunkWords - SCRYPT_BLOCK_WORDS] & (N - 1);
+
+		/* 8: X = H(Y ^ V_j) */
+		SCRYPT_CHUNKMIX_FN(X, Y, scrypt_item(V, j, chunkWords), r);
+	}
+
+	/* 10: B' = X */
+	/* implicit */
+
+	SCRYPT_ROMIX_UNTANGLE_FN(X, r * 2);
+}
+
+#endif /* !defined(SCRYPT_CHOOSE_COMPILETIME) || !defined(SCRYPT_HAVE_ROMIX) */
+
+
+#undef SCRYPT_CHUNKMIX_FN
+#undef SCRYPT_ROMIX_FN
+#undef SCRYPT_MIX_FN
+#undef SCRYPT_ROMIX_TANGLE_FN
+#undef SCRYPT_ROMIX_UNTANGLE_FN
+
diff --git a/scryptjane/scrypt-jane-romix.h b/scryptjane/scrypt-jane-romix.h
new file mode 100644
index 00000000..faa655a0
--- /dev/null
+++ b/scryptjane/scrypt-jane-romix.h
@@ -0,0 +1,27 @@
+#if defined(SCRYPT_CHACHA)
+#include "scrypt-jane-chacha.h"
+#elif defined(SCRYPT_SALSA)
+#include "scrypt-jane-salsa.h"
+#elif defined(SCRYPT_SALSA64)
+#include "scrypt-jane-salsa64.h"
+#else
+	#define SCRYPT_MIX_BASE "ERROR"
+	typedef uint32_t scrypt_mix_word_t;
+	#define SCRYPT_WORDTO8_LE U32TO8_LE
+	#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP
+	#define SCRYPT_BLOCK_BYTES 64
+	#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t))
+	#if !defined(SCRYPT_CHOOSE_COMPILETIME)
+		static void FASTCALL scrypt_ROMix_error(scrypt_mix_word_t *X/*[chunkWords]*/, scrypt_mix_word_t *Y/*[chunkWords]*/, scrypt_mix_word_t *V/*[chunkWords * N]*/, uint32_t N, uint32_t r) {}
+		static scrypt_ROMixfn scrypt_getROMix() { return scrypt_ROMix_error; }
+	#else
+		static void FASTCALL scrypt_ROMix(scrypt_mix_word_t *X, scrypt_mix_word_t *Y, scrypt_mix_word_t *V, uint32_t N, uint32_t r) {}
+	#endif
+	static int scrypt_test_mix() { return 0; }
+	#error must define a mix function!
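+	/*
+	 * The stubs above exist so that the #error is the only diagnostic: later
+	 * references to scrypt_ROMix/scrypt_test_mix still parse. For scale
+	 * (illustrative arithmetic for the template in
+	 * scrypt-jane-romix-template.h): with 64-byte blocks of 32-bit words,
+	 * SCRYPT_BLOCK_WORDS = 16, so r = 1 gives chunkWords = 16 * 2 * 1 = 32
+	 * words (128 bytes) and N = 1024 makes V a 128 KiB lookup table. N must
+	 * be a power of two for the `& (N - 1)` reduction to match `% N`.
+	 */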
+#endif + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +#undef SCRYPT_MIX +#define SCRYPT_MIX SCRYPT_MIX_BASE +#endif diff --git a/scryptjane/scrypt-jane-salsa.h b/scryptjane/scrypt-jane-salsa.h new file mode 100644 index 00000000..0c1604ba --- /dev/null +++ b/scryptjane/scrypt-jane-salsa.h @@ -0,0 +1,106 @@ +#define SCRYPT_MIX_BASE "Salsa20/8" + +typedef uint32_t scrypt_mix_word_t; + +#define SCRYPT_WORDTO8_LE U32TO8_LE +#define SCRYPT_WORD_ENDIAN_SWAP U32_SWAP + +#define SCRYPT_BLOCK_BYTES 64 +#define SCRYPT_BLOCK_WORDS (SCRYPT_BLOCK_BYTES / sizeof(scrypt_mix_word_t)) + +/* must have these here in case block bytes is ever != 64 */ +#include "scrypt-jane-romix-basic.h" + +#include "scrypt-jane-mix_salsa-avx.h" +#include "scrypt-jane-mix_salsa-sse2.h" +#include "scrypt-jane-mix_salsa.h" + +#if defined(SCRYPT_SALSA_AVX) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_avx + #define SCRYPT_ROMIX_FN scrypt_ROMix_avx + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +#if defined(SCRYPT_SALSA_SSE2) + #define SCRYPT_CHUNKMIX_FN scrypt_ChunkMix_sse2 + #define SCRYPT_ROMIX_FN scrypt_ROMix_sse2 + #define SCRYPT_MIX_FN salsa_core_sse2 + #define SCRYPT_ROMIX_TANGLE_FN salsa_core_tangle_sse2 + #define SCRYPT_ROMIX_UNTANGLE_FN salsa_core_tangle_sse2 + #include "scrypt-jane-romix-template.h" +#endif + +/* cpu agnostic */ +#define SCRYPT_ROMIX_FN scrypt_ROMix_basic +#define SCRYPT_MIX_FN salsa_core_basic +#define SCRYPT_ROMIX_TANGLE_FN scrypt_romix_convert_endian +#define SCRYPT_ROMIX_UNTANGLE_FN scrypt_romix_convert_endian +#include "scrypt-jane-romix-template.h" + +#if !defined(SCRYPT_CHOOSE_COMPILETIME) +static scrypt_ROMixfn +scrypt_getROMix() { + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + return scrypt_ROMix_avx; + else +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + return scrypt_ROMix_sse2; + else +#endif + + return scrypt_ROMix_basic; +} +#endif + + +#if defined(SCRYPT_TEST_SPEED) +static size_t +available_implementations() { + size_t flags = 0; + +#if defined(SCRYPT_SALSA_AVX) + flags |= cpu_avx; +#endif + +#if defined(SCRYPT_SALSA_SSE2) + flags |= cpu_sse2; +#endif + + return flags; +} +#endif + + +static int +scrypt_test_mix() { + static const uint8_t expected[16] = { + 0x41,0x1f,0x2e,0xa3,0xab,0xa3,0x1a,0x34,0x87,0x1d,0x8a,0x1c,0x76,0xa0,0x27,0x66, + }; + + int ret = 1; + size_t cpuflags = detect_cpu(); + +#if defined(SCRYPT_SALSA_AVX) + if (cpuflags & cpu_avx) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_avx, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_SSE2) + if (cpuflags & cpu_sse2) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_sse2, salsa_core_tangle_sse2, salsa_core_tangle_sse2, expected); +#endif + +#if defined(SCRYPT_SALSA_BASIC) + ret &= scrypt_test_mix_instance(scrypt_ChunkMix_basic, scrypt_romix_convert_endian, scrypt_romix_convert_endian, expected); +#endif + + return ret; +} diff --git a/scryptjane/scrypt-jane-test-vectors.h b/scryptjane/scrypt-jane-test-vectors.h new file mode 100644 index 00000000..a1e4c619 --- /dev/null +++ b/scryptjane/scrypt-jane-test-vectors.h @@ -0,0 +1,261 @@ +typedef struct scrypt_test_setting_t { + const char *pw, *salt; + uint8_t Nfactor, rfactor, pfactor; +} scrypt_test_setting; + +static const scrypt_test_setting post_settings[] = { + {"", "", 3, 0, 0}, + {"password", "NaCl", 9, 3, 4}, + {0} +}; + +#if 
defined(SCRYPT_SHA256) + #if defined(SCRYPT_SALSA) + /* sha256 + salsa20/8, the only 'official' test vectors! */ + static const uint8_t post_vectors[][64] = { + {0x77,0xd6,0x57,0x62,0x38,0x65,0x7b,0x20,0x3b,0x19,0xca,0x42,0xc1,0x8a,0x04,0x97, + 0xf1,0x6b,0x48,0x44,0xe3,0x07,0x4a,0xe8,0xdf,0xdf,0xfa,0x3f,0xed,0xe2,0x14,0x42, + 0xfc,0xd0,0x06,0x9d,0xed,0x09,0x48,0xf8,0x32,0x6a,0x75,0x3a,0x0f,0xc8,0x1f,0x17, + 0xe8,0xd3,0xe0,0xfb,0x2e,0x0d,0x36,0x28,0xcf,0x35,0xe2,0x0c,0x38,0xd1,0x89,0x06}, + {0xfd,0xba,0xbe,0x1c,0x9d,0x34,0x72,0x00,0x78,0x56,0xe7,0x19,0x0d,0x01,0xe9,0xfe, + 0x7c,0x6a,0xd7,0xcb,0xc8,0x23,0x78,0x30,0xe7,0x73,0x76,0x63,0x4b,0x37,0x31,0x62, + 0x2e,0xaf,0x30,0xd9,0x2e,0x22,0xa3,0x88,0x6f,0xf1,0x09,0x27,0x9d,0x98,0x30,0xda, + 0xc7,0x27,0xaf,0xb9,0x4a,0x83,0xee,0x6d,0x83,0x60,0xcb,0xdf,0xa2,0xcc,0x06,0x40} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xef,0x8f,0x44,0x8f,0xc3,0xef,0x78,0x13,0xb2,0x26,0xa7,0x2a,0x40,0xa1,0x98,0x7f, + 0xc8,0x7f,0x0d,0x5f,0x40,0x66,0xa2,0x05,0x07,0x4f,0xc7,0xac,0x3b,0x47,0x07,0x0c, + 0xf5,0x20,0x46,0x76,0x20,0x7b,0xee,0x51,0x6d,0x5f,0xfa,0x9c,0x27,0xac,0xa9,0x36, + 0x62,0xbd,0xde,0x0b,0xa3,0xc0,0x66,0x84,0xde,0x82,0xd0,0x1a,0xb4,0xd1,0xb5,0xfe}, + {0xf1,0x94,0xf7,0x5f,0x15,0x12,0x10,0x4d,0x6e,0xfb,0x04,0x8c,0x35,0xc4,0x51,0xb6, + 0x11,0x04,0xa7,0x9b,0xb0,0x46,0xaf,0x7b,0x47,0x39,0xf0,0xac,0xb2,0x8a,0xfa,0x45, + 0x09,0x86,0x8f,0x10,0x4b,0xc6,0xee,0x00,0x11,0x38,0x73,0x7a,0x6a,0xd8,0x25,0x67, + 0x85,0xa4,0x10,0x4e,0xa9,0x2f,0x15,0xfe,0xcf,0x63,0xe1,0xe8,0xcf,0xab,0xe8,0xbd} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xf4,0x87,0x29,0xf4,0xc3,0x31,0x8c,0xe8,0xdf,0xe5,0xd8,0x73,0xff,0xca,0x32,0xcf, + 0xd8,0xac,0xe7,0xf7,0x15,0xda,0x84,0x41,0x60,0x23,0x26,0x4a,0xc8,0x3e,0xee,0xa6, + 0xa5,0x6e,0x52,0xd6,0x64,0x55,0x16,0x31,0x3e,0x66,0x7b,0x65,0xd5,0xe2,0xc9,0x95, + 0x1b,0xf0,0x81,0x40,0xb7,0x2f,0xff,0xa6,0xe6,0x02,0xcc,0x63,0x08,0x4a,0x74,0x31}, + {0x7a,0xd8,0xad,0x02,0x9c,0xa5,0xf4,0x42,0x6a,0x29,0xd2,0xb5,0x53,0xf1,0x6d,0x1d, + 0x25,0xc8,0x70,0x48,0x80,0xb9,0xa3,0xf6,0x94,0xf8,0xfa,0xb8,0x52,0x42,0xcd,0x14, + 0x26,0x46,0x28,0x06,0xc7,0xf6,0x1f,0xa7,0x89,0x6d,0xc5,0xa0,0x36,0xcc,0xde,0xcb, + 0x73,0x0b,0xa4,0xe2,0xd3,0xd1,0x44,0x06,0x35,0x08,0xe0,0x35,0x5b,0xf8,0xd7,0xe7} + }; + #endif +#elif defined(SCRYPT_SHA512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xae,0x54,0xe7,0x74,0xe4,0x51,0x6b,0x0f,0xe1,0xe7,0x28,0x03,0x17,0xe4,0x8c,0xfa, + 0x2f,0x66,0x55,0x7f,0xdc,0x3b,0x40,0xab,0x47,0x84,0xc9,0x63,0x36,0x07,0x9d,0xe5, + 0x86,0x43,0x95,0x89,0xb6,0xc0,0x6c,0x72,0x64,0x00,0xc1,0x2a,0xd7,0x69,0x21,0x92, + 0x8e,0xba,0xa4,0x59,0x9f,0x00,0x14,0x3a,0x7c,0x12,0x58,0x91,0x09,0xa0,0x32,0xfe}, + {0xc5,0xb3,0xd6,0xea,0x0a,0x4b,0x1e,0xcc,0x40,0x00,0xe5,0x98,0x5c,0xdc,0x06,0x06, + 0x78,0x34,0x92,0x16,0xcf,0xe4,0x9f,0x03,0x96,0x2d,0x41,0x35,0x00,0x9b,0xff,0x74, + 0x60,0x19,0x6e,0xe6,0xa6,0x46,0xf7,0x37,0xcb,0xfa,0xd0,0x9f,0x80,0x72,0x2e,0x85, + 0x13,0x3e,0x1a,0x91,0x90,0x53,0xa1,0x33,0x85,0x51,0xdc,0x62,0x1c,0x0e,0x4d,0x30} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xe2,0x05,0x7c,0x44,0xf9,0x55,0x9f,0x64,0xbe,0xd5,0x7f,0x85,0x69,0xc7,0x8c,0x7f, + 0x2b,0x91,0xd6,0x9a,0x6c,0xf8,0x57,0x55,0x61,0x25,0x3d,0xee,0xb8,0xd5,0x8c,0xdc, + 0x2d,0xd5,0x53,0x84,0x8c,0x06,0xaa,0x37,0x77,0xa6,0xf0,0xf1,0x35,0xfe,0xb5,0xcb, + 0x61,0xd7,0x2c,0x67,0xf3,0x7e,0x8a,0x1b,0x04,0xa3,0xa3,0x43,0xa2,0xb2,0x29,0xf2}, + 
{0x82,0xda,0x29,0xb2,0x08,0x27,0xfc,0x78,0x22,0xc4,0xb8,0x7e,0xbc,0x36,0xcf,0xcd, + 0x17,0x4b,0xa1,0x30,0x16,0x4a,0x25,0x70,0xc7,0xcb,0xe0,0x2b,0x56,0xd3,0x16,0x4e, + 0x85,0xb6,0x84,0xe7,0x9b,0x7f,0x8b,0xb5,0x94,0x33,0xcf,0x33,0x44,0x65,0xc8,0xa1, + 0x46,0xf9,0xf5,0xfc,0x74,0x29,0x7e,0xd5,0x46,0xec,0xbd,0x95,0xc1,0x80,0x24,0xe4} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xa6,0xcb,0x77,0x9a,0x64,0x1f,0x95,0x02,0x53,0xe7,0x5c,0x78,0xdb,0xa3,0x43,0xff, + 0xbe,0x10,0x4c,0x7b,0xe4,0xe1,0x91,0xcf,0x67,0x69,0x5a,0x2c,0x12,0xd6,0x99,0x49, + 0x92,0xfd,0x5a,0xaa,0x12,0x4c,0x2e,0xf6,0x95,0x46,0x8f,0x5e,0x77,0x62,0x16,0x29, + 0xdb,0xe7,0xab,0x02,0x2b,0x9c,0x35,0x03,0xf8,0xd4,0x04,0x7d,0x2d,0x73,0x85,0xf1}, + {0x54,0xb7,0xca,0xbb,0xaf,0x0f,0xb0,0x5f,0xb7,0x10,0x63,0x48,0xb3,0x15,0xd8,0xb5, + 0x62,0x64,0x89,0x6a,0x59,0xc6,0x0f,0x86,0x96,0x38,0xf0,0xcf,0xd4,0x62,0x90,0x61, + 0x7d,0xce,0xd6,0x13,0x85,0x67,0x4a,0xf5,0x32,0x03,0x74,0x30,0x0b,0x5a,0x2f,0x86, + 0x82,0x6e,0x0c,0x3e,0x40,0x7a,0xde,0xbe,0x42,0x6e,0x80,0x2b,0xaf,0xdb,0xcc,0x94} + }; + #endif +#elif defined(SCRYPT_BLAKE512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0x4a,0x48,0xb3,0xfa,0xdc,0xb0,0xb8,0xdb,0x54,0xee,0xf3,0x5c,0x27,0x65,0x6c,0x20, + 0xab,0x61,0x9a,0x5b,0xd5,0x1d,0xd9,0x95,0xab,0x88,0x0e,0x4d,0x1e,0x71,0x2f,0x11, + 0x43,0x2e,0xef,0x23,0xca,0x8a,0x49,0x3b,0x11,0x38,0xa5,0x28,0x61,0x2f,0xb7,0x89, + 0x5d,0xef,0x42,0x4c,0xc1,0x74,0xea,0x8a,0x56,0xbe,0x4a,0x82,0x76,0x15,0x1a,0x87}, + {0x96,0x24,0xbf,0x40,0xeb,0x03,0x8e,0xfe,0xc0,0xd5,0xa4,0x81,0x85,0x7b,0x09,0x88, + 0x52,0xb5,0xcb,0xc4,0x48,0xe1,0xb9,0x1d,0x3f,0x8b,0x3a,0xc6,0x38,0x32,0xc7,0x55, + 0x30,0x28,0x7a,0x42,0xa9,0x5d,0x54,0x33,0x62,0xf3,0xd9,0x3c,0x96,0x40,0xd1,0x80, + 0xe4,0x0e,0x7e,0xf0,0x64,0x53,0xfe,0x7b,0xd7,0x15,0xba,0xad,0x16,0x80,0x01,0xb5} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x45,0x42,0x22,0x31,0x26,0x13,0x5f,0x94,0xa4,0x00,0x04,0x47,0xe8,0x50,0x6d,0xd6, + 0xdd,0xd5,0x08,0xd4,0x90,0x64,0xe0,0x59,0x70,0x46,0xff,0xfc,0x29,0xb3,0x6a,0xc9, + 0x4d,0x45,0x97,0x95,0xa8,0xf0,0x53,0xe7,0xee,0x4b,0x6b,0x5d,0x1e,0xa5,0xb2,0x58, + 0x4b,0x93,0xc9,0x89,0x4c,0xa8,0xab,0x03,0x74,0x38,0xbd,0x54,0x97,0x6b,0xab,0x4a}, + {0x4b,0x4a,0x63,0x96,0x73,0x34,0x9f,0x39,0x64,0x51,0x0e,0x2e,0x3b,0x07,0xd5,0x1c, + 0xd2,0xf7,0xce,0x60,0xab,0xac,0x89,0xa4,0x16,0x0c,0x58,0x82,0xb3,0xd3,0x25,0x5b, + 0xd5,0x62,0x32,0xf4,0x86,0x5d,0xb2,0x4b,0xbf,0x8e,0xc6,0xc0,0xac,0x40,0x48,0xb4, + 0x69,0x08,0xba,0x40,0x4b,0x07,0x2a,0x13,0x9c,0x98,0x3b,0x8b,0x20,0x0c,0xac,0x9e} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xcb,0x4b,0xc2,0xd1,0xf4,0x77,0x32,0x3c,0x42,0x9d,0xf7,0x7d,0x1f,0x22,0x64,0xa4, + 0xe2,0x88,0x30,0x2d,0x54,0x9d,0xb6,0x26,0x89,0x25,0x30,0xc3,0x3d,0xdb,0xba,0x99, + 0xe9,0x8e,0x1e,0x5e,0x57,0x66,0x75,0x7c,0x24,0xda,0x00,0x6f,0x79,0xf7,0x47,0xf5, + 0xea,0x40,0x70,0x37,0xd2,0x91,0xc7,0x4d,0xdf,0x46,0xb6,0x3e,0x95,0x7d,0xcb,0xc1}, + {0x25,0xc2,0xcb,0x7f,0xc8,0x50,0xb7,0x0b,0x11,0x9e,0x1d,0x10,0xb2,0xa8,0x35,0x23, + 0x91,0x39,0xfb,0x45,0xf2,0xbf,0xe4,0xd0,0x84,0xec,0x72,0x33,0x6d,0x09,0xed,0x41, + 0x9a,0x7e,0x4f,0x10,0x73,0x97,0x22,0x76,0x58,0x93,0x39,0x24,0xdf,0xd2,0xaa,0x2f, + 0x6b,0x2b,0x64,0x48,0xa5,0xb7,0xf5,0x56,0x77,0x02,0xa7,0x71,0x46,0xe5,0x0e,0x8d}, + }; + #endif +#elif defined(SCRYPT_BLAKE256) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + 
{0xf1,0xf1,0x91,0x1a,0x81,0xe6,0x9f,0xc1,0xce,0x43,0xab,0xb1,0x1a,0x02,0x1e,0x16, + 0x08,0xc6,0xf9,0x00,0x50,0x1b,0x6d,0xf1,0x31,0x06,0x95,0x48,0x5d,0xf7,0x6c,0x00, + 0xa2,0x4c,0xb1,0x0e,0x52,0x66,0x94,0x7e,0x84,0xfc,0xa5,0x34,0xfd,0xf0,0xe9,0x57, + 0x85,0x2d,0x8c,0x05,0x5c,0x0f,0x04,0xd4,0x8d,0x3e,0x13,0x52,0x3d,0x90,0x2d,0x2c}, + {0xd5,0x42,0xd2,0x7b,0x06,0xae,0x63,0x90,0x9e,0x30,0x00,0x0e,0xd8,0xa4,0x3a,0x0b, + 0xee,0x4a,0xef,0xb2,0xc4,0x95,0x0d,0x72,0x07,0x70,0xcc,0xa3,0xf9,0x1e,0xc2,0x75, + 0xcf,0xaf,0xe1,0x44,0x1c,0x8c,0xe2,0x3e,0x0c,0x81,0xf3,0x92,0xe1,0x13,0xe6,0x4f, + 0x2d,0x27,0xc3,0x87,0xe5,0xb6,0xf9,0xd7,0x02,0x04,0x37,0x64,0x78,0x36,0x6e,0xb3} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xad,0x1b,0x4b,0xca,0xe3,0x26,0x1a,0xfd,0xb7,0x77,0x8c,0xde,0x8d,0x26,0x14,0xe1, + 0x54,0x38,0x42,0xf3,0xb3,0x66,0x29,0xf9,0x90,0x04,0xf1,0x82,0x7c,0x5a,0x6f,0xa8, + 0x7d,0xd6,0x08,0x0d,0x8b,0x78,0x04,0xad,0x31,0xea,0xd4,0x87,0x2d,0xf7,0x74,0x9a, + 0xe5,0xce,0x97,0xef,0xa3,0xbb,0x90,0x46,0x7c,0xf4,0x51,0x38,0xc7,0x60,0x53,0x21}, + {0x39,0xbb,0x56,0x3d,0x0d,0x7b,0x74,0x82,0xfe,0x5a,0x78,0x3d,0x66,0xe8,0x3a,0xdf, + 0x51,0x6f,0x3e,0xf4,0x86,0x20,0x8d,0xe1,0x81,0x22,0x02,0xf7,0x0d,0xb5,0x1a,0x0f, + 0xfc,0x59,0xb6,0x60,0xc9,0xdb,0x38,0x0b,0x5b,0x95,0xa5,0x94,0xda,0x42,0x2d,0x90, + 0x47,0xeb,0x73,0x31,0x9f,0x20,0xf6,0x81,0xc2,0xef,0x33,0x77,0x51,0xd8,0x2c,0xe4} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0x9e,0xf2,0x60,0x7c,0xbd,0x7c,0x19,0x5c,0x79,0xc6,0x1b,0x7e,0xb0,0x65,0x1b,0xc3, + 0x70,0x0d,0x89,0xfc,0x72,0xb2,0x03,0x72,0x15,0xcb,0x8e,0x8c,0x49,0x50,0x4c,0x27, + 0x99,0xda,0x47,0x32,0x5e,0xb4,0xa2,0x07,0x83,0x51,0x6b,0x06,0x37,0x60,0x42,0xc4, + 0x59,0x49,0x99,0xdd,0xc0,0xd2,0x08,0x94,0x7f,0xe3,0x9e,0x4e,0x43,0x8e,0x5b,0xba}, + {0x86,0x6f,0x3b,0x11,0xb8,0xca,0x4b,0x6e,0xa7,0x6f,0xc2,0xc9,0x33,0xb7,0x8b,0x9f, + 0xa3,0xb9,0xf5,0xb5,0x62,0xa6,0x17,0x66,0xe4,0xc3,0x9d,0x9b,0xca,0x51,0xb0,0x2f, + 0xda,0x09,0xc1,0x77,0xed,0x8b,0x89,0xc2,0x69,0x5a,0x34,0x05,0x4a,0x1f,0x4d,0x76, + 0xcb,0xd5,0xa4,0x78,0xfa,0x1b,0xb9,0x5b,0xbc,0x3d,0xce,0x04,0x63,0x99,0xad,0x54} + }; + #endif +#elif defined(SCRYPT_SKEIN512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xe4,0x36,0xa0,0x9a,0xdb,0xf0,0xd1,0x45,0x56,0xda,0x25,0x53,0x00,0xf9,0x2c,0x69, + 0xa4,0xc2,0xa5,0x8e,0x1a,0x85,0xfa,0x53,0xbd,0x55,0x3d,0x11,0x2a,0x44,0x13,0x87, + 0x8f,0x81,0x88,0x13,0x1e,0x49,0xa8,0xc4,0xc5,0xcd,0x1f,0xe1,0x5f,0xf5,0xcb,0x2f, + 0x8b,0xab,0x57,0x38,0x59,0xeb,0x6b,0xac,0x3b,0x73,0x10,0xa6,0xe1,0xfe,0x17,0x3e}, + {0x6d,0x61,0xde,0x43,0xa9,0x38,0x53,0x5f,0xd8,0xf2,0x6d,0xf3,0xe4,0xd6,0xd8,0x5e, + 0x81,0x89,0xd0,0x0b,0x86,0x16,0xb1,0x91,0x65,0x76,0xd8,0xc1,0xf7,0x3b,0xca,0x8b, + 0x35,0x07,0x58,0xba,0x77,0xdf,0x11,0x6c,0xbc,0x58,0xee,0x11,0x59,0xf2,0xfe,0xcb, + 0x51,0xdc,0xcd,0x35,0x2e,0x46,0x22,0xa0,0xaa,0x55,0x60,0x7c,0x91,0x15,0xb8,0x00} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0xd1,0x12,0x6d,0x64,0x10,0x0e,0x98,0x6c,0xbe,0x70,0x21,0xd9,0xc6,0x04,0x62,0xa4, + 0x29,0x13,0x9a,0x3c,0xf8,0xe9,0x1e,0x87,0x9f,0x88,0xf4,0x98,0x01,0x41,0x8e,0xce, + 0x60,0xf7,0xbe,0x17,0x0a,0xec,0xd6,0x30,0x80,0xcf,0x6b,0x1e,0xcf,0x95,0xa0,0x4d, + 0x37,0xed,0x3a,0x09,0xd1,0xeb,0x0c,0x80,0x82,0x22,0x8e,0xd3,0xb1,0x7f,0xd6,0xa8}, + {0x5c,0x5c,0x05,0xe2,0x75,0xa5,0xa4,0xec,0x81,0x97,0x9c,0x5b,0xd7,0x26,0xb3,0x16, + 0xb4,0x02,0x8c,0x56,0xe6,0x32,0x57,0x33,0x47,0x19,0x06,0x6c,0xde,0x68,0x41,0x37, + 
0x5b,0x7d,0xa7,0xb3,0x73,0xeb,0x82,0xca,0x0f,0x86,0x2e,0x6b,0x47,0xa2,0x70,0x39, + 0x35,0xfd,0x2d,0x2e,0x7b,0xc3,0x68,0xbb,0x52,0x42,0x19,0x3b,0x78,0x96,0xe7,0xc8} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xd2,0xad,0x32,0x05,0xee,0x80,0xe3,0x44,0x70,0xc6,0x34,0xde,0x05,0xb6,0xcf,0x60, + 0x89,0x98,0x70,0xc0,0xb8,0xf5,0x54,0xf1,0xa6,0xb2,0xc8,0x76,0x34,0xec,0xc4,0x59, + 0x8e,0x64,0x42,0xd0,0xa9,0xed,0xe7,0x19,0xb2,0x8a,0x11,0xc6,0xa6,0xbf,0xa7,0xa9, + 0x4e,0x44,0x32,0x7e,0x12,0x91,0x9d,0xfe,0x52,0x48,0xa8,0x27,0xb3,0xfc,0xb1,0x89}, + {0xd6,0x67,0xd2,0x3e,0x30,0x1e,0x9d,0xe2,0x55,0x68,0x17,0x3d,0x2b,0x75,0x5a,0xe5, + 0x04,0xfb,0x3d,0x0e,0x86,0xe0,0xaa,0x1d,0xd4,0x72,0xda,0xb0,0x79,0x41,0xb7,0x99, + 0x68,0xe5,0xd9,0x55,0x79,0x7d,0xc3,0xd1,0xa6,0x56,0xc1,0xbe,0x0b,0x6c,0x62,0x23, + 0x66,0x67,0x91,0x47,0x99,0x13,0x6b,0xe3,0xda,0x59,0x55,0x18,0x67,0x8f,0x2e,0x3b} + }; + #endif +#elif defined(SCRYPT_KECCAK512) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0xc2,0x7b,0xbe,0x1d,0xf1,0x99,0xd8,0xe7,0x1b,0xac,0xe0,0x9d,0xeb,0x5a,0xfe,0x21, + 0x71,0xff,0x41,0x51,0x4f,0xbe,0x41,0x01,0x15,0xe2,0xb7,0xb9,0x55,0x15,0x25,0xa1, + 0x40,0x4c,0x66,0x29,0x32,0xb7,0xc9,0x62,0x60,0x88,0xe0,0x99,0x39,0xae,0xce,0x25, + 0x3c,0x11,0x89,0xdd,0xc6,0x14,0xd7,0x3e,0xa3,0x6d,0x07,0x2e,0x56,0xa0,0xff,0x97}, + {0x3c,0x91,0x12,0x4a,0x37,0x7d,0xd6,0x96,0xd2,0x9b,0x5d,0xea,0xb8,0xb9,0x82,0x4e, + 0x4f,0x6b,0x60,0x4c,0x59,0x01,0xe5,0x73,0xfd,0xf6,0xb8,0x9a,0x5a,0xd3,0x7c,0x7a, + 0xd2,0x4f,0x8e,0x74,0xc1,0x90,0x88,0xa0,0x3f,0x55,0x75,0x79,0x10,0xd0,0x09,0x79, + 0x0f,0x6c,0x74,0x0c,0x05,0x08,0x3c,0x8c,0x94,0x7b,0x30,0x56,0xca,0xdf,0xdf,0x34} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x77,0xcb,0x70,0xbf,0xae,0xd4,0x4c,0x5b,0xbc,0xd3,0xec,0x8a,0x82,0x43,0x8d,0xb3, + 0x7f,0x1f,0xfb,0x70,0x36,0x32,0x4d,0xa6,0xb7,0x13,0x37,0x77,0x30,0x0c,0x3c,0xfb, + 0x2c,0x20,0x8f,0x2a,0xf4,0x47,0x4d,0x69,0x8e,0xae,0x2d,0xad,0xba,0x35,0xe9,0x2f, + 0xe6,0x99,0x7a,0xf8,0xcf,0x70,0x78,0xbb,0x0c,0x72,0x64,0x95,0x8b,0x36,0x77,0x3d}, + {0xc6,0x43,0x17,0x16,0x87,0x09,0x5f,0x12,0xed,0x21,0xe2,0xb4,0xad,0x55,0xa1,0xa1, + 0x49,0x50,0x90,0x70,0xab,0x81,0x83,0x7a,0xcd,0xdf,0x23,0x52,0x19,0xc0,0xa2,0xd8, + 0x8e,0x98,0xeb,0xf0,0x37,0xab,0xad,0xfd,0x1c,0x04,0x97,0x18,0x42,0x85,0xf7,0x4b, + 0x18,0x2c,0x55,0xd3,0xa9,0xe6,0x89,0xfb,0x58,0x0a,0xb2,0x37,0xb9,0xf8,0xfb,0xc5} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xc7,0x34,0x95,0x02,0x5e,0x31,0x0d,0x1f,0x10,0x38,0x9c,0x3f,0x04,0x53,0xed,0x05, + 0x27,0x38,0xc1,0x3f,0x6a,0x0f,0xc5,0xa3,0x9b,0x73,0x8a,0x28,0x7e,0x5d,0x3c,0xdc, + 0x9d,0x5a,0x09,0xbf,0x8c,0x0a,0xad,0xe4,0x73,0x52,0xe3,0x6d,0xaa,0xd1,0x8b,0xbf, + 0xa3,0xb7,0xf0,0x58,0xad,0x22,0x24,0xc9,0xaa,0x96,0xb7,0x5d,0xfc,0x5f,0xb0,0xcf}, + {0x76,0x22,0xfd,0xe8,0xa2,0x79,0x8e,0x9d,0x43,0x8c,0x7a,0xba,0x78,0xb7,0x84,0xf1, + 0xc8,0xee,0x3b,0xae,0x31,0x89,0xbf,0x7e,0xd0,0x4b,0xc1,0x2d,0x58,0x5d,0x84,0x6b, + 0xec,0x86,0x56,0xe0,0x87,0x94,0x7f,0xbc,0xf9,0x48,0x92,0xef,0x54,0x7f,0x23,0x8d, + 0x4f,0x8b,0x0a,0x75,0xa7,0x39,0x0e,0x46,0x6e,0xee,0x58,0xc8,0xfa,0xea,0x90,0x53} + }; + #endif +#elif defined(SCRYPT_KECCAK256) + #if defined(SCRYPT_SALSA) + static const uint8_t post_vectors[][64] = { + {0x2e,0x96,0xd8,0x87,0x45,0xcd,0xd6,0xc8,0xf6,0xd2,0x87,0x33,0x50,0xc7,0x04,0xe5, + 0x3c,0x4b,0x48,0x44,0x57,0xc1,0x74,0x09,0x76,0x02,0xaa,0xd3,0x7b,0xf3,0xbf,0xed, + 
0x4b,0x72,0xd7,0x1b,0x49,0x6b,0xe0,0x44,0x83,0xee,0x8f,0xaf,0xa1,0xb5,0x33,0xa9, + 0x9e,0x86,0xab,0xe2,0x9f,0xcf,0x68,0x6e,0x7e,0xbd,0xf5,0x7a,0x83,0x4b,0x1c,0x10}, + {0x42,0x7e,0xf9,0x4b,0x72,0x61,0xda,0x2d,0xb3,0x27,0x0e,0xe1,0xd9,0xde,0x5f,0x3e, + 0x64,0x2f,0xd6,0xda,0x90,0x59,0xce,0xbf,0x02,0x5b,0x32,0xf7,0x6d,0x94,0x51,0x7b, + 0xb6,0xa6,0x0d,0x99,0x3e,0x7f,0x39,0xbe,0x1b,0x1d,0x6c,0x97,0x12,0xd8,0xb7,0xfd, + 0x5b,0xb5,0xf3,0x73,0x5a,0x89,0xb2,0xdd,0xcc,0x3d,0x74,0x2e,0x3d,0x9e,0x3c,0x22} + }; + #elif defined(SCRYPT_CHACHA) + static const uint8_t post_vectors[][64] = { + {0x76,0x1d,0x5b,0x8f,0xa9,0xe1,0xa6,0x01,0xcb,0xc5,0x7a,0x5f,0x02,0x23,0xb6,0x82, + 0x57,0x79,0x60,0x2f,0x05,0x7f,0xb8,0x0a,0xcb,0x5e,0x54,0x11,0x49,0x2e,0xdd,0x85, + 0x83,0x30,0x67,0xb3,0x24,0x5c,0xce,0xfc,0x32,0xcf,0x12,0xc3,0xff,0xe0,0x79,0x36, + 0x74,0x17,0xa6,0x3e,0xcd,0xa0,0x7e,0xcb,0x37,0xeb,0xcb,0xb6,0xe1,0xb9,0xf5,0x15}, + {0xf5,0x66,0xa7,0x4c,0xe4,0xdc,0x18,0x56,0x2f,0x3e,0x86,0x4d,0x92,0xa5,0x5c,0x5a, + 0x8f,0xc3,0x6b,0x32,0xdb,0xe5,0x72,0x50,0x84,0xfc,0x6e,0x5d,0x15,0x77,0x3d,0xca, + 0xc5,0x2b,0x20,0x3c,0x78,0x37,0x80,0x78,0x23,0x56,0x91,0xa0,0xce,0xa4,0x06,0x5a, + 0x7f,0xe3,0xbf,0xab,0x51,0x57,0x32,0x2c,0x0a,0xf0,0xc5,0x6f,0xf4,0xcb,0xff,0x42} + }; + #elif defined(SCRYPT_SALSA64) + static const uint8_t post_vectors[][64] = { + {0xb0,0xb7,0x10,0xb5,0x1f,0x2b,0x7f,0xaf,0x9d,0x95,0x5f,0x4c,0x2d,0x98,0x7c,0xc1, + 0xbc,0x37,0x2f,0x50,0x8d,0xb2,0x9f,0xfd,0x48,0x0d,0xe0,0x44,0x19,0xdf,0x28,0x6c, + 0xab,0xbf,0x1e,0x17,0x26,0xcc,0x57,0x95,0x18,0x17,0x83,0x4c,0x12,0x48,0xd9,0xee, + 0x4b,0x00,0x29,0x06,0x31,0x01,0x6b,0x8c,0x26,0x39,0xbf,0xe4,0xe4,0xd4,0x6a,0x26}, + {0xa0,0x40,0xb2,0xf2,0x11,0xb6,0x5f,0x3d,0x4c,0x1e,0xef,0x59,0xd4,0x98,0xdb,0x14, + 0x01,0xff,0xe3,0x34,0xd7,0x19,0xcd,0xeb,0xde,0x52,0x1c,0xf4,0x86,0x43,0xc9,0xe2, + 0xfb,0xf9,0x4f,0x0a,0xbb,0x1f,0x5c,0x6a,0xdf,0xb9,0x28,0xfa,0xac,0xc4,0x48,0xed, + 0xcc,0xd2,0x2e,0x25,0x5f,0xf3,0x56,0x1d,0x2d,0x23,0x22,0xc1,0xbc,0xff,0x78,0x80} + }; + #endif +#else + static const uint8_t post_vectors[][64] = {{0}}; +#endif + diff --git a/sha2-arm.S b/sha2-arm.S new file mode 100644 index 00000000..7ea307cf --- /dev/null +++ b/sha2-arm.S @@ -0,0 +1,1583 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. 
+ */ + +#include "cpuminer-config.h" + +#if defined(__arm__) && defined(__APCS_32__) + +.macro sha256_k + .align 2 + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +.endm + +.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz + mov r12, \ry, ror #17 + add r11, r11, \ra + eor r12, r12, \ry, ror #19 + mov \ra, lr, ror #7 + eor r12, r12, \ry, lsr #10 + eor \ra, \ra, lr, ror #18 + add r12, r12, r11 + ldr r11, [\rw, #(\i+2)*4] + eor \ra, \ra, lr, lsr #3 + add \ra, \ra, r12 + + mov r12, \rz, ror #17 + str \ra, [\rw, #(\i+16)*4] + add lr, lr, \rb + eor r12, r12, \rz, ror #19 + mov \rb, r11, ror #7 + eor r12, r12, \rz, lsr #10 + eor \rb, \rb, r11, ror #18 + add lr, lr, r12 + eor \rb, \rb, r11, lsr #3 + add \rb, \rb, lr +.endm + +.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz + ldr lr, [\rw, #(\i+1)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + ldr lr, [\rw, #(\i+3)*4] +.endm + +.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz + str \rz, [\rw, #(\i+15)*4] + sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz + str \rb, [\rw, #(\i+17)*4] +.endm + +.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + eor r3, \ra, \rb + add \rh, \rh, lr, ror #6 + + and r3, r3, \rc + eor r12, \ra, \ra, ror #11 + and lr, \ra, \rb + eor r12, r12, \ra, ror #20 + eor lr, lr, r3 + add r3, \rh, lr + add \rh, \rh, \rd + add \rd, r3, r12, ror #2 +.endm + +.macro sha256_main_quadround i, ka, rw + sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform + .globl _sha256_transform +#ifdef __ELF__ + .type sha256_transform, %function +#endif +sha256_transform: +_sha256_transform: + stmfd sp!, {r4-r11, lr} + cmp r2, #0 + sub sp, sp, #64*4 + bne sha256_transform_swap + + ldmia r1!, {r4-r11} + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + stmia r3, {r4-r11} + b sha256_transform_extend + +.macro bswap rd, rn + eor r12, \rn, \rn, ror #16 + bic r12, r12, #0x00ff0000 + mov \rd, \rn, ror #8 + eor \rd, \rd, r12, lsr #8 +.endm + +sha256_transform_swap: + ldmia r1!, 
{r4-r11} + bswap r4, r4 + bswap r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia sp, {r4-r11} + add r3, sp, #8*4 + ldmia r1, {r4-r11} + bswap r4, r4 + bswap r5, r5 + bswap r6, r6 + bswap r7, r7 + bswap r8, r8 + bswap r9, r9 + bswap r10, r10 + bswap r11, r11 + stmia r3, {r4-r11} + +sha256_transform_extend: + add r12, sp, #9*4 + ldr r11, [sp, #0*4] + ldmia r12, {r4-r10} + sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 + sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 + + ldmia r0, {r4-r11} + sha256_main_quadround 0, sha256_transform_k, sp + sha256_main_quadround 4, sha256_transform_k, sp + sha256_main_quadround 8, sha256_transform_k, sp + sha256_main_quadround 12, sha256_transform_k, sp + sha256_main_quadround 16, sha256_transform_k, sp + sha256_main_quadround 20, sha256_transform_k, sp + sha256_main_quadround 24, sha256_transform_k, sp + sha256_main_quadround 28, sha256_transform_k, sp + b sha256_transform_k_over +sha256_transform_k: + sha256_k +sha256_transform_k_over: + sha256_main_quadround 32, sha256_transform_k, sp + sha256_main_quadround 36, sha256_transform_k, sp + sha256_main_quadround 40, sha256_transform_k, sp + sha256_main_quadround 44, sha256_transform_k, sp + sha256_main_quadround 48, sha256_transform_k, sp + sha256_main_quadround 52, sha256_transform_k, sp + sha256_main_quadround 56, sha256_transform_k, sp + sha256_main_quadround 60, sha256_transform_k, sp + + ldmia r0, {r1, r2, r3, r12} + add r4, r4, r1 + add r5, r5, r2 + add r6, r6, r3 + add r7, r7, r12 + stmia r0!, {r4-r7} + ldmia r0, {r1, r2, r3, r12} + add r8, r8, r1 + add r9, r9, r2 + add r10, r10, r3 + add r11, r11, r12 + stmia r0, {r8-r11} + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + + .text + .code 32 + .align 2 + .globl sha256d_ms + .globl _sha256d_ms +#ifdef __ELF__ + .type sha256d_ms, %function +#endif +sha256d_ms: +_sha256d_ms: + stmfd sp!, {r4-r11, lr} + sub sp, sp, #64*4 + + cmp r0, r0 + + ldr lr, [r1, #3*4] + ldr r6, [r1, #18*4] + ldr r7, [r1, #19*4] + + mov r12, lr, ror #7 + str r6, [sp, #18*4] + eor r12, r12, lr, ror #18 + str r7, [sp, #19*4] + eor r12, r12, lr, lsr #3 + ldr r8, [r1, #20*4] + add r6, r6, r12 + ldr r10, [r1, #22*4] + add r7, r7, lr + 
str r6, [r1, #18*4] + + mov r12, r6, ror #17 + str r7, [r1, #19*4] + eor r12, r12, r6, ror #19 + str r8, [sp, #20*4] + eor r12, r12, r6, lsr #10 + ldr r4, [r1, #23*4] + add r8, r8, r12 + ldr r5, [r1, #24*4] + + mov r9, r7, ror #17 + str r8, [r1, #20*4] + eor r9, r9, r7, ror #19 + str r10, [sp, #21*4] + eor r9, r9, r7, lsr #10 + str r4, [sp, #22*4] + + mov r12, r8, ror #17 + str r9, [r1, #21*4] + eor r12, r12, r8, ror #19 + str r5, [sp, #23*4] + eor r12, r12, r8, lsr #10 + mov lr, r9, ror #17 + add r10, r10, r12 + ldr r11, [r1, #30*4] + + eor lr, lr, r9, ror #19 + str r10, [r1, #22*4] + eor lr, lr, r9, lsr #10 + str r11, [sp, #24*4] + add r4, r4, lr + + mov r12, r10, ror #17 + str r4, [r1, #23*4] + eor r12, r12, r10, ror #19 + mov lr, r4, ror #17 + eor r12, r12, r10, lsr #10 + eor lr, lr, r4, ror #19 + add r5, r5, r12 + eor lr, lr, r4, lsr #10 + str r5, [r1, #24*4] + add r6, r6, lr + + mov r12, r5, ror #17 + str r6, [r1, #25*4] + eor r12, r12, r5, ror #19 + mov lr, r6, ror #17 + eor r12, r12, r5, lsr #10 + eor lr, lr, r6, ror #19 + add r7, r7, r12 + eor lr, lr, r6, lsr #10 + str r7, [r1, #26*4] + add r8, r8, lr + + mov r12, r7, ror #17 + str r8, [r1, #27*4] + eor r12, r12, r7, ror #19 + mov lr, r8, ror #17 + eor r12, r12, r7, lsr #10 + eor lr, lr, r8, ror #19 + add r9, r9, r12 + eor lr, lr, r8, lsr #10 + str r9, [r1, #28*4] + add r10, r10, lr + + ldr lr, [r1, #31*4] + mov r12, r9, ror #17 + str r10, [r1, #29*4] + eor r12, r12, r9, ror #19 + str lr, [sp, #25*4] + eor r12, r12, r9, lsr #10 + add r11, r11, r12 + add r5, r5, lr + mov r12, r10, ror #17 + add r4, r4, r11 + + ldr r11, [r1, #16*4] + eor r12, r12, r10, ror #19 + str r4, [r1, #30*4] + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + ldr lr, [r1, #17*4] + +sha256d_ms_extend_loop2: + sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 + sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 + sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 + sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 + sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 + sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 + sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 + sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 + bne sha256d_ms_extend_coda2 + sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 + sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 + + ldr r4, [r3, #0*4] + ldr r9, [r3, #1*4] + ldr r10, [r3, #2*4] + ldr r11, [r3, #3*4] + ldr r8, [r3, #4*4] + ldr r5, [r3, #5*4] + ldr r6, [r3, #6*4] + ldr r7, [r3, #7*4] + b sha256d_ms_main_loop1 + +sha256d_ms_main_loop2: + sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 +sha256d_ms_main_loop1: + sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 4, sha256d_ms_k, r1 + sha256_main_quadround 8, sha256d_ms_k, r1 + sha256_main_quadround 12, sha256d_ms_k, r1 + sha256_main_quadround 16, sha256d_ms_k, r1 + sha256_main_quadround 20, sha256d_ms_k, r1 + sha256_main_quadround 24, sha256d_ms_k, r1 + sha256_main_quadround 28, sha256d_ms_k, r1 
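+@ Note: the round-constant table below is deliberately emitted in the middle
+@ of the function and branched over: the `ldr r3, \ka + (\i)*4` inside
+@ sha256_main_round is a PC-relative literal load with a reach of roughly
+@ 4 KB, so the pool has to sit within range of the quadrounds on both sides.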
+ b sha256d_ms_k_over +sha256d_ms_k: + sha256_k +sha256d_ms_k_over: + sha256_main_quadround 32, sha256d_ms_k, r1 + sha256_main_quadround 36, sha256d_ms_k, r1 + sha256_main_quadround 40, sha256d_ms_k, r1 + sha256_main_quadround 44, sha256d_ms_k, r1 + sha256_main_quadround 48, sha256d_ms_k, r1 + sha256_main_quadround 52, sha256d_ms_k, r1 + sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + bne sha256d_ms_finish + sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 + sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 + sha256_main_quadround 60, sha256d_ms_k, r1 + + ldmia r2!, {r3, r12, lr} + add r4, r4, r3 + add r5, r5, r12 + add r6, r6, lr + stmia sp, {r4-r6} + ldmia r2, {r3, r4, r5, r6, r12} + add lr, sp, #3*4 + add r7, r7, r3 + add r8, r8, r4 + add r9, r9, r5 + add r10, r10, r6 + add r11, r11, r12 + add r12, sp, #18*4 + stmia lr!, {r7-r11} + + ldmia r12, {r4-r11} + str r4, [r1, #18*4] + str r5, [r1, #19*4] + str r6, [r1, #20*4] + str r7, [r1, #22*4] + str r8, [r1, #23*4] + str r9, [r1, #24*4] + str r10, [r1, #30*4] + str r11, [r1, #31*4] + + mov r3, #0x80000000 + mov r4, #0 + mov r5, #0 + mov r6, #0 + mov r7, #0 + mov r8, #0 + mov r9, #0 + mov r10, #0x00000100 + stmia lr, {r3-r10} + + ldr lr, [sp, #1*4] + movs r1, sp + ldr r4, [sp, #0*4] + + ldr r11, [sp, #2*4] + mov r12, lr, ror #7 + eor r12, r12, lr, ror #18 + add r5, lr, #0x00a00000 + eor r12, r12, lr, lsr #3 + mov lr, r11, ror #7 + add r4, r4, r12 + eor lr, lr, r11, ror #18 + str r4, [sp, #16*4] + eor lr, lr, r11, lsr #3 + mov r12, r4, ror #17 + add r5, r5, lr + ldr lr, [sp, #3*4] + + str r5, [sp, #17*4] + eor r12, r12, r4, ror #19 + mov r6, lr, ror #7 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, ror #18 + add r11, r11, r12 + eor r6, r6, lr, lsr #3 + mov r12, r5, ror #17 + add r6, r6, r11 + ldr r11, [sp, #4*4] + + str r6, [sp, #18*4] + eor r12, r12, r5, ror #19 + mov r7, r11, ror #7 + eor r12, r12, r5, lsr #10 + eor r7, r7, r11, ror #18 + add lr, lr, r12 + eor r7, r7, r11, lsr #3 + mov r12, r6, ror #17 + add r7, r7, lr + ldr lr, [sp, #5*4] + + str r7, [sp, #19*4] + eor r12, r12, r6, ror #19 + mov r8, lr, ror #7 + eor r12, r12, r6, lsr #10 + eor r8, r8, lr, ror #18 + add r11, r11, r12 + eor r8, r8, lr, lsr #3 + mov r12, r7, ror #17 + add r8, r8, r11 + ldr r11, [sp, #6*4] + + str r8, [sp, #20*4] + eor r12, r12, r7, ror #19 + mov r9, r11, ror #7 + eor r12, r12, r7, lsr #10 + eor r9, r9, r11, ror #18 + add lr, lr, r12 + eor r9, r9, r11, lsr #3 + mov r12, r8, ror #17 + add r9, r9, lr + ldr lr, [sp, #7*4] + + str r9, [sp, #21*4] + eor r12, r12, r8, ror #19 + mov r10, lr, ror #7 + eor r12, r12, r8, lsr #10 + eor r10, r10, lr, ror #18 + add r11, r11, r12 + eor r10, r10, lr, lsr #3 + mov r12, r9, ror #17 + add r11, r11, #0x00000100 + add lr, lr, r4 + add r10, r10, r11 + + eor r12, r12, r9, ror #19 + str r10, [sp, #22*4] + add lr, lr, #0x11000000 + eor r12, r12, r9, lsr #10 + add lr, lr, r12 + mov r12, r10, ror #17 + add r4, lr, #0x00002000 + eor r12, r12, r10, ror #19 + str r4, [sp, #23*4] + add r5, r5, #0x80000000 + eor r12, r12, r10, lsr #10 + add r5, r5, r12 + + mov r12, r4, ror #17 + str r5, [sp, #24*4] + eor r12, r12, r4, ror #19 + mov r11, r5, ror #17 + eor r12, r12, r4, lsr #10 + eor r11, r11, r5, ror #19 + add r6, r6, r12 + eor r11, r11, r5, lsr #10 + str r6, [sp, #25*4] + add r7, r7, r11 + + mov r12, r6, ror #17 + str r7, [sp, #26*4] + eor r12, r12, r6, ror #19 + mov r11, r7, ror #17 + eor r12, 
r12, r6, lsr #10 + eor r11, r11, r7, ror #19 + add r8, r8, r12 + eor r11, r11, r7, lsr #10 + str r8, [sp, #27*4] + add r9, r9, r11 + + mov lr, r8, ror #17 + mov r12, r9, ror #17 + str r9, [sp, #28*4] + add r4, r4, #0x00400000 + eor lr, lr, r8, ror #19 + eor r12, r12, r9, ror #19 + eor lr, lr, r8, lsr #10 + eor r12, r12, r9, lsr #10 + add r4, r4, #0x00000022 + add r10, r10, lr + add r4, r4, r12 + ldr r11, [sp, #16*4] + + add r5, r5, #0x00000100 + str r4, [sp, #30*4] + mov lr, r11, ror #7 + str r10, [sp, #29*4] + mov r12, r10, ror #17 + eor lr, lr, r11, ror #18 + eor r12, r12, r10, ror #19 + eor lr, lr, r11, lsr #3 + eor r12, r12, r10, lsr #10 + add r5, r5, lr + ldr lr, [r1, #17*4] + add r5, r5, r12 + + b sha256d_ms_extend_loop2 + +sha256d_ms_extend_coda2: + str r5, [r1, #(44+15)*4] + mov r12, r4, ror #17 + add r11, r11, r6 + mov r6, lr, ror #7 + eor r12, r12, r4, ror #19 + eor r6, r6, lr, ror #18 + eor r12, r12, r4, lsr #10 + eor r6, r6, lr, lsr #3 + add r12, r12, r11 + add r6, r6, r12 + str r6, [r1, #(44+16)*4] + + adr r2, sha256d_ms_h + ldmia r2, {r4-r11} + b sha256d_ms_main_loop2 + +sha256d_ms_h: + .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a + .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 + +.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh + ldr r12, [\rw, #(\i)*4] + and r3, \rf, \re + bic lr, \rg, \re + add \rh, \rh, \rd + orr lr, lr, r3 + ldr r3, \ka + (\i)*4 + add \rh, \rh, lr + eor lr, \re, \re, ror #5 + add \rh, \rh, r12 + eor lr, lr, \re, ror #19 + add \rh, \rh, r3 + add \rh, \rh, lr, ror #6 +.endm + +sha256d_ms_finish: + sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 + sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 + sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 + ldr r5, [r2, #7*4] + sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 + + add r11, r11, r5 + str r11, [r0, #7*4] + + add sp, sp, #64*4 +#ifdef __thumb__ + ldmfd sp!, {r4-r11, lr} + bx lr +#else + ldmfd sp!, {r4-r11, pc} +#endif + + +#ifdef __ARM_NEON__ + + .text + .code 32 + .align 2 + .globl sha256_init_4way + .globl _sha256_init_4way +#ifdef __ELF__ + .type sha256_init_4way, %function +#endif +sha256_init_4way: +_sha256_init_4way: + adr r12, sha256_4h + vldmia r12, {q8-q15} + vstmia r0, {q8-q15} + bx lr + .align 4 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +.macro sha256_4k + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 
0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 +.endm + +.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz + vadd.u32 q5, q5, \ra + veor.u32 q4, q4, q0 + vshr.u32 q0, \ry, #19 + vshl.u32 q1, \ry, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \ra, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshr.u32 q1, \ry, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 \ra, \ra, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 \ra, \ra, q1 + vadd.u32 q4, q4, q5 + veor.u32 \ra, \ra, q0 + vld1.u32 {q5}, [\rr]! + vadd.u32 \ra, \ra, q4 + + vshr.u32 q4, \rz, #17 + vshl.u32 q0, \rz, #32-17 + vadd.u32 q6, q6, \rb + vst1.u32 {\ra}, [\rw]! 
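+	@ NEON has no 32-bit rotate instruction, so each scalar `ror #n` is
+	@ emulated here as vshr #n / vshl #(32-n) / veor; every q register holds
+	@ the same schedule word for four independent messages, one per lane.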
+ veor.u32 q4, q4, q0 + vshr.u32 q0, \rz, #19 + vshl.u32 q1, \rz, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 \rb, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, \rz, #10 + veor.u32 \rb, \rb, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 \rb, \rb, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 \rb, \rb, q1 + vadd.u32 q1, q6, q4 + veor.u32 \rb, \rb, q0 +.endm + +.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz + vld1.u32 {q6}, [\rr]! + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vld1.u32 {q6}, [\rr]! + vadd.u32 \rb, \rb, q1 +.endm + +.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz + vshr.u32 q4, \ry, #17 + vshl.u32 q0, \ry, #32-17 + vst1.u32 {\rz}, [\rw]! + sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz + vadd.u32 \rb, \rb, q1 + vst1.u32 {\rb}, [\rw]! +.endm + +.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! + vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vld1.u32 {q9}, [\rk]! + vadd.u32 \rh, \rh, q10 + vshl.u32 q12, \re, #32-5 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vadd.u32 \rh, \rh, q9 + veor.u32 q9, \ra, \rb + vshr.u32 q11, q10, #6 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + + vshr.u32 q11, \ra, #11 + vshl.u32 q12, \ra, #32-11 + veor.u32 q8, \ra, q11 + vand.u32 q10, \ra, \rb + veor.u32 q8, q8, q12 + vshr.u32 q11, \ra, #20 + vshl.u32 q12, \ra, #32-20 + veor.u32 q8, q8, q11 + vand.u32 q9, q9, \rc + veor.u32 q8, q8, q12 + vadd.u32 \rh, \rh, q13 + veor.u32 q10, q10, q9 + vshr.u32 q11, q8, #2 + vshl.u32 q12, q8, #32-2 + vadd.u32 q9, \rh, q10 + vadd.u32 q12, q12, q11 + vadd.u32 \rh, \rh, \rd + vadd.u32 \rd, q9, q12 +.endm + +.macro sha256_4way_main_quadround i, rk, rw + sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 +.endm + + + .text + .code 32 + .align 2 + .globl sha256_transform_4way + .globl _sha256_transform_4way +#ifdef __ELF__ + .type sha256_transform_4way, %function +#endif +sha256_transform_4way: +_sha256_transform_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + cmp r2, #0 + bne sha256_transform_4way_swap + + vldmia r1!, {q0-q7} + vstmia sp, {q0-q7} + add r3, sp, #8*16 + vldmia r1, {q8-q15} + vstmia r3, {q8-q15} + b sha256_transform_4way_extend + +sha256_transform_4way_swap: + vldmia r1!, {q0-q7} + vrev32.8 q0, q0 + vrev32.8 q1, q1 + vrev32.8 q2, q2 + vrev32.8 q3, q3 + vldmia r1, {q8-q15} + vrev32.8 q4, q4 + vrev32.8 q5, q5 + vrev32.8 q6, q6 + vrev32.8 q7, q7 + vstmia sp, {q0-q7} + vrev32.8 q8, q8 + vrev32.8 q9, q9 + vrev32.8 q10, q10 + vrev32.8 q11, q11 + vrev32.8 q12, q12 + vrev32.8 q13, q13 + vrev32.8 q14, q14 + vrev32.8 q15, q15 + add r3, sp, #8*16 + vstmia r3, {q8-q15} + 
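+@ Four-way layout: the input is interleaved so that 32-bit lane k of every q
+@ register belongs to message k; one pass through this code therefore
+@ performs four SHA-256 transforms in parallel.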
+sha256_transform_4way_extend: + add r1, sp, #1*16 + add r2, sp, #16*16 + vmov.u32 q5, q0 + sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 + + vldmia r0, {q0-q7} + adr r4, sha256_transform_4way_4k + b sha256_transform_4way_4k_over + .align 4 +sha256_transform_4way_4k: + sha256_4k +sha256_transform_4way_4k_over: + sha256_4way_main_quadround 0, r4, sp + sha256_4way_main_quadround 4, r4, sp + sha256_4way_main_quadround 8, r4, sp + sha256_4way_main_quadround 12, r4, sp + sha256_4way_main_quadround 16, r4, sp + sha256_4way_main_quadround 20, r4, sp + sha256_4way_main_quadround 24, r4, sp + sha256_4way_main_quadround 28, r4, sp + sha256_4way_main_quadround 32, r4, sp + sha256_4way_main_quadround 36, r4, sp + sha256_4way_main_quadround 40, r4, sp + sha256_4way_main_quadround 44, r4, sp + sha256_4way_main_quadround 48, r4, sp + sha256_4way_main_quadround 52, r4, sp + sha256_4way_main_quadround 56, r4, sp + sha256_4way_main_quadround 60, r4, sp + + vldmia r0, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + vstmia r0, {q0-q7} + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +#ifdef __ELF__ + .type sha256d_ms_4way, %function +#endif +sha256d_ms_4way: +_sha256d_ms_4way: + stmfd sp!, {r4, lr} + vpush {q4-q7} + mov r12, sp + sub sp, sp, #64*16 + bic sp, sp, #63 + + add r4, r1, #3*16 + vld1.u32 {q6}, [r4]! 
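+	@ As in the scalar sha256d_ms above, only the message words that depend
+	@ on the nonce (word 3 feeds W[18] onward) are recomputed here; the
+	@ nonce-independent parts of the schedule were precomputed by the caller.
+	@ The `cmp r0, r0` just below sets Z so that the shared `bne` exits fall
+	@ through on the first of the two SHA-256 passes.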
+ add r1, r1, #18*16 + vldmia r1, {q11-q13} + cmp r0, r0 + + vshr.u32 q10, q6, #7 + vshl.u32 q0, q6, #32-7 + vshr.u32 q1, q6, #18 + veor.u32 q10, q10, q0 + vshl.u32 q0, q6, #32-18 + veor.u32 q10, q10, q1 + vshr.u32 q1, q6, #3 + veor.u32 q10, q10, q0 + vstmia sp!, {q11-q13} + veor.u32 q4, q10, q1 + vadd.u32 q12, q12, q6 + vadd.u32 q11, q11, q4 + + vshr.u32 q14, q12, #17 + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + vshl.u32 q0, q12, #32-17 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vadd.u32 q13, q13, q4 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q14, q14, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q1, q12, #10 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + veor.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + vld1.u32 {q15}, [r1] + veor.u32 q4, q4, q1 + vst1.u32 {q15}, [sp]! + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q9}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vst1.u32 {q9}, [sp]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q15, #17 + vadd.u32 q9, q9, q5 + vshl.u32 q0, q15, #32-17 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vld1.u32 {q10}, [r1] + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vst1.u32 {q10}, [sp]! + veor.u32 q4, q4, q1 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q4 + vshr.u32 q4, q9, #17 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q9, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q10}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q2 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q11, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q12}, [r1]! + veor.u32 q5, q4, q1 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q2, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q2 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + veor.u32 q4, q4, q1 + vshr.u32 q1, q13, #10 + veor.u32 q4, q4, q0 + vst1.u32 {q14}, [r1]! + veor.u32 q5, q4, q1 + add r4, r4, #12*16 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q15, q15, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vld1.u32 {q2}, [r1] + veor.u32 q4, q4, q1 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q9, q4 + vst1.u32 {q2}, [sp]! 
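+/* Cached schedule words that this transform overwrites are first
+ * copied to the scratch stack (the vst1 ... [sp]! stores) and are
+ * written back to the data buffer after the first transform (see
+ * the vldmia sp / vstmia r1 sequence further down). */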
+ vadd.u32 q9, q9, q2 + vshr.u32 q4, q15, #17 + vshr.u32 q2, q15, #19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q2 + vshr.u32 q0, q15, #10 + veor.u32 q4, q4, q1 + vld1.u32 {q5-q6}, [r4]! + veor.u32 q4, q4, q0 + vld1.u32 {q2}, [r1] + vadd.u32 q10, q10, q4 + vst1.u32 {q2}, [sp]! + vadd.u32 q10, q10, q2 + + sub sp, sp, #8*16 + +sha256d_ms_4way_extend_loop2: + sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 + sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 + sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 + sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 + sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 + sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 + sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 + bne sha256d_ms_4way_extend_coda2 + + vldmia r3!, {q4-q7} + vldmia r3, {q0-q3} + vswp q0, q4 + adr r3, sha256d_ms_4way_4k+3*16 + sub r1, r1, #(64-3)*16 + b sha256d_ms_4way_main_loop1 + + .align 4 +sha256d_ms_4way_4k: + sha256_4k + +sha256d_ms_4way_main_loop2: + sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 +sha256d_ms_4way_main_loop1: + sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 4, r3, r1 + sha256_4way_main_quadround 8, r3, r1 + sha256_4way_main_quadround 12, r3, r1 + sha256_4way_main_quadround 16, r3, r1 + sha256_4way_main_quadround 20, r3, r1 + sha256_4way_main_quadround 24, r3, r1 + sha256_4way_main_quadround 28, r3, r1 + sha256_4way_main_quadround 32, r3, r1 + sha256_4way_main_quadround 36, r3, r1 + sha256_4way_main_quadround 40, r3, r1 + sha256_4way_main_quadround 44, r3, r1 + sha256_4way_main_quadround 48, r3, r1 + sha256_4way_main_quadround 52, r3, r1 + sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + bne sha256d_ms_4way_finish + sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 + sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 + sha256_4way_main_quadround 60, r3, r1 + + vldmia r2, {q8-q15} + vadd.u32 q0, q0, q8 + vadd.u32 q1, q1, q9 + vadd.u32 q2, q2, q10 + vadd.u32 q3, q3, q11 + vadd.u32 q4, q4, q12 + vadd.u32 q5, q5, q13 + vadd.u32 q6, q6, q14 + vadd.u32 q7, q7, q15 + + vldmia sp, {q8-q15} + sub r1, r1, #(64-18)*16 + vstmia r1, {q8-q10} + add r1, r1, #4*16 + vstmia r1, {q11-q13} + add r1, r1, #8*16 + vstmia r1, {q14-q15} + + vstmia sp, {q0-q7} + vmov.u32 q8, #0x80000000 + vmov.u32 q9, #0 + vmov.u32 q10, #0 + vmov.u32 q11, #0 + vmov.u32 q12, #0 + vmov.u32 q13, #0 + vmov.u32 q14, #0 + vmov.u32 q15, #0x00000100 + add r1, sp, #8*16 + vstmia r1!, {q8-q15} + adds r4, sp, #2*16 + + vshr.u32 q9, 
q1, #7 + vshl.u32 q2, q1, #32-7 + vshr.u32 q4, q1, #18 + veor.u32 q9, q9, q2 + vshl.u32 q3, q1, #32-18 + veor.u32 q9, q9, q4 + vshr.u32 q2, q1, #3 + veor.u32 q9, q9, q3 + vld1.u32 {q5}, [r4]! + veor.u32 q9, q9, q2 + vmov.u32 q7, #0x00a00000 + vadd.u32 q9, q9, q0 + vshr.u32 q10, q5, #7 + vshl.u32 q0, q5, #32-7 + vshl.u32 q3, q5, #32-18 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q10, q10, q3 + vst1.u32 {q9}, [r1]! + vadd.u32 q3, q1, q7 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #3 + vld1.u32 {q6}, [r4]! + veor.u32 q10, q10, q0 + + vshr.u32 q4, q9, #17 + vshl.u32 q0, q9, #32-17 + vadd.u32 q10, q10, q3 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q11, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshr.u32 q1, q9, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q11, q11, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q11, q11, q1 + vadd.u32 q4, q4, q5 + veor.u32 q11, q11, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q11, q11, q4 + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vst1.u32 {q10}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q12, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q10, #10 + veor.u32 q12, q12, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q12, q12, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q11}, [r1]! + veor.u32 q12, q12, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q12, q12, q0 + + vshr.u32 q4, q11, #17 + vshl.u32 q0, q11, #32-17 + vadd.u32 q12, q12, q1 + vld1.u32 {q6}, [r4]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q13, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshr.u32 q1, q11, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q13, q13, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q13, q13, q1 + vadd.u32 q4, q4, q5 + veor.u32 q13, q13, q0 + vld1.u32 {q5}, [r4]! + vadd.u32 q13, q13, q4 + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vst1.u32 {q12}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q14, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q12, #10 + veor.u32 q14, q14, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q14, q14, q0 + vshl.u32 q1, q5, #32-18 + vst1.u32 {q13}, [r1]! + veor.u32 q14, q14, q1 + vshr.u32 q0, q5, #3 + vadd.u32 q1, q6, q4 + veor.u32 q14, q14, q0 + + vshr.u32 q4, q13, #17 + vshl.u32 q0, q13, #32-17 + vadd.u32 q14, q14, q1 + vld1.u32 {q6}, [r4]! + vadd.u32 q5, q5, q15 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q15, q6, #7 + vshl.u32 q0, q6, #32-7 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshr.u32 q1, q13, #10 + vshr.u32 q0, q6, #18 + veor.u32 q4, q4, q1 + veor.u32 q15, q15, q0 + vshl.u32 q1, q6, #32-18 + vshr.u32 q0, q6, #3 + veor.u32 q15, q15, q1 + vadd.u32 q4, q4, q5 + veor.u32 q15, q15, q0 + vmov.u32 q5, #0x80000000 + vadd.u32 q15, q15, q4 + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vadd.u32 q6, q6, q9 + vst1.u32 {q14}, [r1]! + vmov.u32 q7, #0x11000000 + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + vadd.u32 q6, q6, q7 + vmov.u32 q2, #0x00002000 + veor.u32 q4, q4, q0 + vst1.u32 {q15}, [r1]! 
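+/* The vmov.u32 immediates in this region (0x00a00000,
+ * 0x11000000 + 0x00002000, 0x80000000, 0x00400000 + 0x00000022) are
+ * precomputed extension terms for the fixed padding words of the
+ * second hash, matching the sha256d_4preext2_* constants in the
+ * x86-64 version; values that do not fit a single NEON vmov
+ * immediate are presumably split across two additions. */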
+ veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vadd.u32 q6, q6, q2 + veor.u32 q1, q4, q1 + add r4, r4, #8*16 + + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q9, q6, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q9}, [r1]! + vadd.u32 q5, q5, q10 + veor.u32 q4, q4, q1 + vshr.u32 q1, q15, #10 + vshl.u32 q0, q9, #32-17 + veor.u32 q10, q4, q1 + vshr.u32 q4, q9, #17 + vadd.u32 q10, q10, q5 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #19 + vshl.u32 q1, q9, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q9, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q10}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q10, #17 + vshl.u32 q0, q10, #32-17 + vadd.u32 q11, q11, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q10, #19 + vshl.u32 q1, q10, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q11}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q10, #10 + vshl.u32 q0, q11, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q11, #17 + vadd.u32 q12, q12, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #19 + vshl.u32 q1, q11, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q11, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q12}, [r1]! + veor.u32 q1, q4, q0 + + vshr.u32 q4, q12, #17 + vshl.u32 q0, q12, #32-17 + vadd.u32 q13, q13, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q12, #19 + vshl.u32 q1, q12, #32-19 + veor.u32 q4, q4, q0 + vst1.u32 {q13}, [r1]! + veor.u32 q4, q4, q1 + vshr.u32 q1, q12, #10 + vshl.u32 q0, q13, #32-17 + veor.u32 q1, q4, q1 + vshr.u32 q4, q13, #17 + vadd.u32 q14, q14, q1 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #19 + vshl.u32 q1, q13, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q0, q13, #10 + veor.u32 q4, q4, q1 + vst1.u32 {q14}, [r1]! + veor.u32 q4, q4, q0 + vmov.u32 q6, #0x00000100 + vadd.u32 q15, q15, q4 + + vshr.u32 q4, q14, #17 + vshl.u32 q0, q14, #32-17 + vmov.u32 q7, #0x00400000 + vst1.u32 {q15}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q14, #19 + vshl.u32 q1, q14, #32-19 + veor.u32 q4, q4, q0 + vadd.u32 q9, q9, q7 + veor.u32 q4, q4, q1 + vshr.u32 q1, q14, #10 + vmov.u32 q2, #0x00000022 + veor.u32 q4, q4, q1 + vadd.u32 q9, q9, q2 + vld1.u32 {q5}, [r4]! + vadd.u32 q9, q9, q4 + vshr.u32 q4, q15, #17 + vshl.u32 q0, q15, #32-17 + vadd.u32 q6, q6, q10 + vst1.u32 {q9}, [r1]! + veor.u32 q4, q4, q0 + vshr.u32 q0, q15, #19 + vshl.u32 q1, q15, #32-19 + veor.u32 q4, q4, q0 + vshr.u32 q10, q5, #7 + veor.u32 q4, q4, q1 + vshl.u32 q0, q5, #32-7 + vshr.u32 q1, q15, #10 + veor.u32 q10, q10, q0 + vshr.u32 q0, q5, #18 + veor.u32 q4, q4, q1 + veor.u32 q10, q10, q0 + vshl.u32 q1, q5, #32-18 + vshr.u32 q0, q5, #3 + veor.u32 q10, q10, q1 + vadd.u32 q1, q6, q4 + veor.u32 q10, q10, q0 + vld1.u32 {q6}, [r4]! + vadd.u32 q10, q10, q1 + + b sha256d_ms_4way_extend_loop2 + + .align 4 +sha256d_ms_4way_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + +sha256d_ms_4way_extend_coda2: + adr r4, sha256d_ms_4way_4h + mov r1, sp + vldmia r4, {q0-q7} + vmov.u32 q15, q7 + sub r3, r3, #64*16 + b sha256d_ms_4way_main_loop2 + +.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh + vld1.u32 {q8}, [\rw]! 
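+/* Reduced round for the tail of the second transform: only the
+ * additions feeding the final output word are computed, and the
+ * Maj/Sigma0 half of the full round is omitted, since
+ * sha256d_ms_4way_finish stores just q7 (H[7]) to the hash. */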
+ vand.u32 q9, \rf, \re + vbic.u32 q10, \rg, \re + vshr.u32 q11, \re, #5 + vorr.u32 q10, q10, q9 + vshl.u32 q12, \re, #32-5 + vadd.u32 \rh, \rh, q10 + veor.u32 q10, \re, q11 + vshr.u32 q11, \re, #19 + veor.u32 q10, q10, q12 + vshl.u32 q12, \re, #32-19 + veor.u32 q10, q10, q11 + vadd.u32 \rh, \rh, q8 + veor.u32 q10, q10, q12 + vld1.u32 {q9}, [\rk]! + vadd.u32 \rh, \rh, \rd + vshr.u32 q11, q10, #6 + vadd.u32 \rh, \rh, q9 + vshl.u32 q13, q10, #32-6 + vadd.u32 \rh, \rh, q11 + vadd.u32 \rh, \rh, q13 +.endm + +sha256d_ms_4way_finish: + sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 + sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 + sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 + sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 + + vadd.u32 q7, q7, q15 + add r0, r0, #7*16 + vst1.u32 {q7}, [r0] + + mov sp, r12 + vpop {q4-q7} + ldmfd sp!, {r4, pc} + + + .text + .code 32 + .align 2 + .globl sha256_use_4way + .globl _sha256_use_4way +#ifdef __ELF__ + .type sha256_use_4way, %function +#endif +sha256_use_4way: +_sha256_use_4way: + mov r0, #1 + bx lr + +#endif /* __ARM_NEON__ */ + +#endif diff --git a/sha2-x64.S b/sha2-x64.S new file mode 100644 index 00000000..909e2fae --- /dev/null +++ b/sha2-x64.S @@ -0,0 +1,3661 @@ +/* + * Copyright 2012-2013 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 
0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + +#ifdef USE_AVX2 + + .data + .p2align 7 +sha256_8h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_8k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 
0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + 
.long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_8preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_8preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_8preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_8preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022 + +#endif /* USE_AVX2 */ + + + .text + 
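+/* sha256_init_4way(state): store the SHA-256 initial hash values
+ * H0..H7, each replicated across the four interleaved lanes, into
+ * the 4-way state. */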
.p2align 6 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + movdqa sha256_4h+0(%rip), %xmm0 + movdqa sha256_4h+16(%rip), %xmm1 + movdqa sha256_4h+32(%rip), %xmm2 + movdqa sha256_4h+48(%rip), %xmm3 + movdqu %xmm0, 0(%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm3 + movdqu %xmm0, 64(%rdi) + movdqu %xmm1, 80(%rdi) + movdqu %xmm2, 96(%rdi) + movdqu %xmm3, 112(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret + + +#ifdef USE_AVX2 + .text + .p2align 6 + .globl sha256_init_8way + .globl _sha256_init_8way +sha256_init_8way: +_sha256_init_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + movq %rcx, %rdi +#endif + vpbroadcastd sha256_4h+0(%rip), %ymm0 + vpbroadcastd sha256_4h+16(%rip), %ymm1 + vpbroadcastd sha256_4h+32(%rip), %ymm2 + vpbroadcastd sha256_4h+48(%rip), %ymm3 + vmovdqu %ymm0, 0*32(%rdi) + vmovdqu %ymm1, 1*32(%rdi) + vmovdqu %ymm2, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vpbroadcastd sha256_4h+64(%rip), %ymm0 + vpbroadcastd sha256_4h+80(%rip), %ymm1 + vpbroadcastd sha256_4h+96(%rip), %ymm2 + vpbroadcastd sha256_4h+112(%rip), %ymm3 + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm1, 5*32(%rdi) + vmovdqu %ymm2, 6*32(%rdi) + vmovdqu %ymm3, 7*32(%rdi) +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rdi +#endif + ret +#endif /* USE_AVX2 */ + + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-7)*16(%rax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%rax), %xmm0 + movdqa (\i-14)*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%rax), %xmm0 + paddd (\i-15)*16(%rax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%rax), %xmm0 + paddd (\i-6)*16(%rax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%rax) + movdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%rax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), 
%xmm2 + pandn %xmm2, %xmm1 + paddd 32(%rsp), %xmm6 + + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)(%rcx), %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +#if defined(USE_AVX) + +.macro sha256_avx_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpsrld $7, %xmm3, %xmm1 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpsrld $2, %xmm1, %xmm1 + vpslld $2, %xmm2, %xmm2 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_avx_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor 
%xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vpslld $10, \r7, %xmm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $9, %xmm2, %xmm2 + vpsrld $9, %xmm1, %xmm1 + vpxor %xmm2, \r4, \r4 + vpxor %xmm1, \r4, \r4 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_avx_main_quadround i + sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + +#endif /* USE_AVX */ + + +#if defined(USE_AVX2) + +.macro sha256_avx2_extend_round i + vmovdqa (\i-15)*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpsrld $10, %ymm3, %ymm3 + vpsrld $7, %ymm3, %ymm1 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpsrld $2, %ymm1, %ymm1 + vpslld $2, %ymm2, %ymm2 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, \i*32(%rax) +.endm + +.macro sha256_avx2_extend_doubleround i + vmovdqa (\i-15)*32(%rax), %ymm0 + vmovdqa (\i-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, \i*32(%rax) + vmovdqa %ymm7, (\i+1)*32(%rax) +.endm + +.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 32*(\i)(%rax), \r0, %ymm6 + vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 + + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor 
%ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 + + vpand \r6, \r5, %ymm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %ymm1 + vpxor \r4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, \r7, %ymm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, \r4, \r4 + vpaddd %ymm6, \r4, \r4 +.endm + +.macro sha256_avx2_main_quadround i + sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 +.endm + +#endif /* USE_AVX2 */ + + +#if defined(USE_XOP) + +.macro sha256_xop_extend_round i + vmovdqa (\i-15)*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + + vprotd $15, %xmm3, %xmm1 + vprotd $13, %xmm3, %xmm2 + vpsrld $10, %xmm3, %xmm3 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm3, %xmm3 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, \i*16(%rax) +.endm + +.macro sha256_xop_extend_doubleround i + vmovdqa (\i-15)*16(%rax), %xmm0 + vmovdqa (\i-14)*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm0 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm0, %xmm0 + vpxor %xmm6, %xmm4, %xmm4 + + vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + + vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 + vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 + + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, \i*16(%rax) + vmovdqa %xmm7, (\i+1)*16(%rax) +.endm + +.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 16*(\i)(%rax), \r0, %xmm6 + vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 + + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 + + vpand \r6, \r5, %xmm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %xmm1 + vpxor \r4, %xmm1, %xmm1 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + + vprotd $30, \r7, %xmm1 + vprotd $19, \r7, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $10, \r7, \r4 + vpxor %xmm2, \r4, \r4 + vpaddd %xmm6, \r4, \r4 +.endm + +.macro sha256_xop_main_quadround i + sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, 
%xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 +.endm + +#endif /* USE_XOP */ + + + .text + .p2align 6 +sha256_transform_4way_core_sse2: + leaq 256(%rsp), %rcx + leaq 48*16(%rcx), %rax + movdqa -2*16(%rcx), %xmm3 + movdqa -1*16(%rcx), %xmm7 +sha256_transform_4way_sse2_extend_loop: + movdqa -15*16(%rcx), %xmm0 + movdqa -14*16(%rcx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%rcx), %xmm0 + paddd -15*16(%rcx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%rcx), %xmm0 + paddd -6*16(%rcx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%rcx) + movdqa %xmm7, 16(%rcx) + addq $2*16, %rcx + cmpq %rcx, %rax + jne sha256_transform_4way_sse2_extend_loop + + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + + leaq sha256_4k(%rip), %rcx + xorq %rax, %rax +sha256_transform_4way_sse2_main_loop: + movdqa (%rsp, %rax), %xmm6 + paddd (%rcx, %rax), %xmm6 + paddd %xmm10, %xmm6 + + movdqa %xmm0, %xmm1 + movdqa %xmm9, %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, %xmm10 + movdqa %xmm8, %xmm2 + movdqa %xmm2, %xmm9 + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, %xmm8 + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addq $16, %rax + cmpq $16*64, %rax + jne sha256_transform_4way_sse2_main_loop + jmp sha256_transform_4way_finish + + +#if defined(USE_AVX) + .text + .p2align 6 +sha256_transform_4way_core_avx: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_avx_extend_doubleround 0 + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + 
sha256_avx_extend_doubleround 6 + sha256_avx_extend_doubleround 8 + sha256_avx_extend_doubleround 10 + sha256_avx_extend_doubleround 12 + sha256_avx_extend_doubleround 14 + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_avx_main_quadround 0 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_quadround 56 + sha256_avx_main_quadround 60 + jmp sha256_transform_4way_finish +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + .text + .p2align 6 +sha256_transform_4way_core_xop: + leaq 256(%rsp), %rax + movdqa -2*16(%rax), %xmm3 + movdqa -1*16(%rax), %xmm7 + sha256_xop_extend_doubleround 0 + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround 6 + sha256_xop_extend_doubleround 8 + sha256_xop_extend_doubleround 10 + sha256_xop_extend_doubleround 12 + sha256_xop_extend_doubleround 14 + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + movdqu 0(%rdi), %xmm7 + movdqu 16(%rdi), %xmm5 + movdqu 32(%rdi), %xmm4 + movdqu 48(%rdi), %xmm3 + movdqu 64(%rdi), %xmm0 + movdqu 80(%rdi), %xmm8 + movdqu 96(%rdi), %xmm9 + movdqu 112(%rdi), %xmm10 + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + sha256_xop_main_quadround 0 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_quadround 56 + sha256_xop_main_quadround 60 + jmp sha256_transform_4way_finish +#endif /* USE_XOP */ + + + .data + .p2align 3 +sha256_transform_4way_core_addr: + .quad 0x0 + +.macro p2bswap_rsi_rsp i + movdqu \i*16(%rsi), %xmm0 + movdqu (\i+1)*16(%rsi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw 
$0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, \i*16(%rsp) + movdqa %xmm2, (\i+1)*16(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_4way + .globl _sha256_transform_4way +sha256_transform_4way: +_sha256_transform_4way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + movdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $1032, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_4way_swap + + movdqu 0*16(%rsi), %xmm0 + movdqu 1*16(%rsi), %xmm1 + movdqu 2*16(%rsi), %xmm2 + movdqu 3*16(%rsi), %xmm3 + movdqu 4*16(%rsi), %xmm4 + movdqu 5*16(%rsi), %xmm5 + movdqu 6*16(%rsi), %xmm6 + movdqu 7*16(%rsi), %xmm7 + movdqa %xmm0, 0*16(%rsp) + movdqa %xmm1, 1*16(%rsp) + movdqa %xmm2, 2*16(%rsp) + movdqa %xmm3, 3*16(%rsp) + movdqa %xmm4, 4*16(%rsp) + movdqa %xmm5, 5*16(%rsp) + movdqa %xmm6, 6*16(%rsp) + movdqa %xmm7, 7*16(%rsp) + movdqu 8*16(%rsi), %xmm0 + movdqu 9*16(%rsi), %xmm1 + movdqu 10*16(%rsi), %xmm2 + movdqu 11*16(%rsi), %xmm3 + movdqu 12*16(%rsi), %xmm4 + movdqu 13*16(%rsi), %xmm5 + movdqu 14*16(%rsi), %xmm6 + movdqu 15*16(%rsi), %xmm7 + movdqa %xmm0, 8*16(%rsp) + movdqa %xmm1, 9*16(%rsp) + movdqa %xmm2, 10*16(%rsp) + movdqa %xmm3, 11*16(%rsp) + movdqa %xmm4, 12*16(%rsp) + movdqa %xmm5, 13*16(%rsp) + movdqa %xmm6, 14*16(%rsp) + movdqa %xmm7, 15*16(%rsp) + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_swap: + p2bswap_rsi_rsp 0 + p2bswap_rsi_rsp 2 + p2bswap_rsi_rsp 4 + p2bswap_rsi_rsp 6 + p2bswap_rsi_rsp 8 + p2bswap_rsi_rsp 10 + p2bswap_rsi_rsp 12 + p2bswap_rsi_rsp 14 + jmp *sha256_transform_4way_core_addr(%rip) + + .p2align 6 +sha256_transform_4way_finish: + movdqu 0(%rdi), %xmm2 + movdqu 16(%rdi), %xmm6 + movdqu 32(%rdi), %xmm11 + movdqu 48(%rdi), %xmm1 + paddd %xmm2, %xmm7 + paddd %xmm6, %xmm5 + paddd %xmm11, %xmm4 + paddd %xmm1, %xmm3 + movdqu 64(%rdi), %xmm2 + movdqu 80(%rdi), %xmm6 + movdqu 96(%rdi), %xmm11 + movdqu 112(%rdi), %xmm1 + paddd %xmm2, %xmm0 + paddd %xmm6, %xmm8 + paddd %xmm11, %xmm9 + paddd %xmm1, %xmm10 + + movdqu %xmm7, 0(%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm4, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm0, 64(%rdi) + movdqu %xmm8, 80(%rdi) + movdqu %xmm9, 96(%rdi) + movdqu %xmm10, 112(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + movdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + + +#ifdef USE_AVX2 + + .text + .p2align 6 +sha256_transform_8way_core_avx2: + leaq 8*64(%rsp), %rax + vmovdqa -2*32(%rax), %ymm3 + vmovdqa -1*32(%rax), %ymm7 + sha256_avx2_extend_doubleround 0 + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + sha256_avx2_extend_doubleround 6 + sha256_avx2_extend_doubleround 8 + sha256_avx2_extend_doubleround 10 + sha256_avx2_extend_doubleround 12 + sha256_avx2_extend_doubleround 14 + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + 
sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + vmovdqu 0*32(%rdi), %ymm7 + vmovdqu 1*32(%rdi), %ymm5 + vmovdqu 2*32(%rdi), %ymm4 + vmovdqu 3*32(%rdi), %ymm3 + vmovdqu 4*32(%rdi), %ymm0 + vmovdqu 5*32(%rdi), %ymm8 + vmovdqu 6*32(%rdi), %ymm9 + vmovdqu 7*32(%rdi), %ymm10 + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + sha256_avx2_main_quadround 0 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_quadround 56 + sha256_avx2_main_quadround 60 + jmp sha256_transform_8way_finish + +.macro p2bswap_avx2_rsi_rsp i + vmovdqu \i*32(%rsi), %ymm0 + vmovdqu (\i+1)*32(%rsi), %ymm2 + vpshuflw $0xb1, %ymm0, %ymm0 + vpshuflw $0xb1, %ymm2, %ymm2 + vpshufhw $0xb1, %ymm0, %ymm0 + vpshufhw $0xb1, %ymm2, %ymm2 + vpsrlw $8, %ymm0, %ymm1 + vpsrlw $8, %ymm2, %ymm3 + vpsllw $8, %ymm0, %ymm0 + vpsllw $8, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm3, %ymm2, %ymm2 + vmovdqa %ymm0, \i*32(%rsp) + vmovdqa %ymm2, (\i+1)*32(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_8way + .globl _sha256_transform_8way +sha256_transform_8way: +_sha256_transform_8way: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $96, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + vmovdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64*32, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_8way_swap + + vmovdqu 0*32(%rsi), %ymm0 + vmovdqu 1*32(%rsi), %ymm1 + vmovdqu 2*32(%rsi), %ymm2 + vmovdqu 3*32(%rsi), %ymm3 + vmovdqu 4*32(%rsi), %ymm4 + vmovdqu 5*32(%rsi), %ymm5 + vmovdqu 6*32(%rsi), %ymm6 + vmovdqu 7*32(%rsi), %ymm7 + vmovdqa %ymm0, 0*32(%rsp) + vmovdqa %ymm1, 1*32(%rsp) + vmovdqa %ymm2, 2*32(%rsp) + vmovdqa %ymm3, 3*32(%rsp) + vmovdqa %ymm4, 4*32(%rsp) + vmovdqa %ymm5, 5*32(%rsp) + vmovdqa %ymm6, 6*32(%rsp) + vmovdqa %ymm7, 7*32(%rsp) + vmovdqu 8*32(%rsi), %ymm0 + vmovdqu 9*32(%rsi), %ymm1 + vmovdqu 10*32(%rsi), %ymm2 + vmovdqu 11*32(%rsi), %ymm3 + vmovdqu 12*32(%rsi), %ymm4 + vmovdqu 13*32(%rsi), %ymm5 + vmovdqu 14*32(%rsi), %ymm6 + vmovdqu 15*32(%rsi), %ymm7 + vmovdqa %ymm0, 8*32(%rsp) + vmovdqa %ymm1, 9*32(%rsp) + vmovdqa %ymm2, 10*32(%rsp) + vmovdqa %ymm3, 11*32(%rsp) + vmovdqa %ymm4, 12*32(%rsp) + vmovdqa %ymm5, 13*32(%rsp) + vmovdqa %ymm6, 14*32(%rsp) + vmovdqa %ymm7, 15*32(%rsp) + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_swap: + p2bswap_avx2_rsi_rsp 0 + p2bswap_avx2_rsi_rsp 2 + p2bswap_avx2_rsi_rsp 4 + p2bswap_avx2_rsi_rsp 6 + p2bswap_avx2_rsi_rsp 8 + p2bswap_avx2_rsi_rsp 10 + p2bswap_avx2_rsi_rsp 12 + p2bswap_avx2_rsi_rsp 14 + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_finish: + vmovdqu 0*32(%rdi), %ymm2 + vmovdqu 1*32(%rdi), %ymm6 + vmovdqu 2*32(%rdi), %ymm11 + vmovdqu 
3*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd %ymm11, %ymm4, %ymm4 + vpaddd %ymm1, %ymm3, %ymm3 + vmovdqu 4*32(%rdi), %ymm2 + vmovdqu 5*32(%rdi), %ymm6 + vmovdqu 6*32(%rdi), %ymm11 + vmovdqu 7*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm0, %ymm0 + vpaddd %ymm6, %ymm8, %ymm8 + vpaddd %ymm11, %ymm9, %ymm9 + vpaddd %ymm1, %ymm10, %ymm10 + + vmovdqu %ymm7, 0*32(%rdi) + vmovdqu %ymm5, 1*32(%rdi) + vmovdqu %ymm4, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm8, 5*32(%rdi) + vmovdqu %ymm9, 6*32(%rdi) + vmovdqu %ymm10, 7*32(%rdi) + + movq %r8, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + vmovdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX2 */ + + + .data + .p2align 3 +sha256d_ms_4way_addr: + .quad 0x0 + + .text + .p2align 6 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + jmp *sha256d_ms_4way_addr(%rip) + + + .p2align 6 +sha256d_ms_4way_sse2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $32, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $8+67*16, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_sse2_extend_loop1: + movdqa 3*16(%rsi), %xmm0 + movdqa 2*16(%rax), %xmm3 + movdqa 3*16(%rax), %xmm7 + movdqa %xmm3, 5*16(%rsp) + movdqa %xmm7, 6*16(%rsp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%rax) + movdqa %xmm7, 3*16(%rax) + + movdqa 4*16(%rax), %xmm0 + movdqa %xmm0, 7*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%rax) + movdqa %xmm7, 5*16(%rax) + + movdqa 6*16(%rax), %xmm0 + movdqa 7*16(%rax), %xmm4 + movdqa %xmm0, 9*16(%rsp) + movdqa %xmm4, 10*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa 8*16(%rax), %xmm0 + movdqa 2*16(%rax), %xmm4 + movdqa %xmm0, 11*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor 
%xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa 14*16(%rax), %xmm0 + movdqa 15*16(%rax), %xmm4 + movdqa %xmm0, 17*16(%rsp) + movdqa %xmm4, 18*16(%rsp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + paddd 8*16(%rax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_sse2_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_sse2_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%rcx), %xmm3 + movdqa 16(%rcx), %xmm0 + movdqa 32(%rcx), %xmm1 + movdqa 48(%rcx), %xmm2 + movdqa 64(%rcx), %xmm6 + movdqa 80(%rcx), %xmm7 + movdqa 96(%rcx), %xmm5 + movdqa 112(%rcx), %xmm4 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop1 + +sha256d_ms_4way_sse2_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_sse2_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + 
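/* + * Note (annotation added in editing, not part of the original diff): + * sha256d_ms stores only the last word of the final state, the one that + * scanhash compares against the target. The jz below is steered purely + * by EFLAGS: none of the SSE2 integer instructions in these loops write + * any flag. On the first pass ZF is still clear from the subq that + * allocated the stack frame, so rounds 57-63 run in full; before the + * second pass a cmpq %rax, %rax sets ZF, which diverts the message + * extension into its coda after doubleround 42 and this loop into + * sha256d_ms_4way_sse2_finish, where reduced rounds compute just the + * one word that is written out. + */ +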
sha256_sse2_main_round 56 + jz sha256d_ms_4way_sse2_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%rsp), %xmm1 + movdqa 6*16(%rsp), %xmm2 + movdqa 7*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 9*16(%rsp), %xmm1 + movdqa 10*16(%rsp), %xmm2 + movdqa 11*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 17*16(%rsp), %xmm1 + movdqa 18*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + movdqa 0(%rsp), %xmm1 + movdqa 16(%rsp), %xmm2 + movdqa 32(%rsp), %xmm6 + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm1 + paddd 96(%rdx), %xmm2 + paddd 112(%rdx), %xmm6 + + movdqa %xmm7, 48+0(%rsp) + movdqa %xmm5, 48+16(%rsp) + movdqa %xmm4, 48+32(%rsp) + movdqa %xmm3, 48+48(%rsp) + movdqa %xmm0, 48+64(%rsp) + movdqa %xmm1, 48+80(%rsp) + movdqa %xmm2, 48+96(%rsp) + movdqa %xmm6, 48+112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 48+128(%rsp) + movdqa %xmm0, 48+144(%rsp) + movdqa %xmm0, 48+160(%rsp) + movdqa %xmm0, 48+176(%rsp) + movdqa %xmm0, 48+192(%rsp) + movdqa %xmm0, 48+208(%rsp) + movdqa %xmm0, 48+224(%rsp) + movdqa %xmm1, 48+240(%rsp) + + leaq 19*16(%rsp), %rax + cmpq %rax, %rax + + movdqa -15*16(%rax), %xmm0 + movdqa -14*16(%rax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%rax), %xmm0 + paddd -15*16(%rax), %xmm4 + paddd sha256d_4preext2_17(%rip), %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%rax) + movdqa %xmm7, 1*16(%rax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%rax), %xmm0 + movdqa sha256d_4preext2_23(%rip), %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%rax), %xmm0 + paddd -9*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%rax) + movdqa %xmm7, 7*16(%rax) + + movdqa sha256d_4preext2_24(%rip), %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 1*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, 
%xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%rax), %xmm7 + movdqa %xmm3, 8*16(%rax) + movdqa %xmm7, 9*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%rax), %xmm3 + paddd 4*16(%rax), %xmm7 + movdqa %xmm3, 10*16(%rax) + movdqa %xmm7, 11*16(%rax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%rax), %xmm3 + paddd 6*16(%rax), %xmm7 + movdqa %xmm3, 12*16(%rax) + movdqa %xmm7, 13*16(%rax) + + movdqa sha256d_4preext2_30(%rip), %xmm0 + movdqa 0*16(%rax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%rax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%rax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%rax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%rax) + movdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_sse2_extend_loop2 + +sha256d_ms_4way_sse2_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm1 + movdqa sha256_4h+96(%rip), %xmm2 + movdqa sha256_4h+112(%rip), %xmm6 + movdqa %xmm1, 0(%rsp) + movdqa %xmm2, 16(%rsp) + movdqa %xmm6, 32(%rsp) + + leaq 48(%rsp), %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_sse2_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*\i(%rax), %xmm6 + paddd 16*\i(%rcx), %xmm6 + paddd 32(%rsp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%rsp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%rsp) + movdqa 0(%rsp), %xmm2 + movdqa %xmm2, 16(%rsp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%rsp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + +sha256d_ms_4way_sse2_finish: + sha256_sse2_main_round_red 57, %xmm3 + sha256_sse2_main_round_red 58, 
%xmm4 + sha256_sse2_main_round_red 59, %xmm5 + sha256_sse2_main_round_red 60, %xmm7 + + paddd sha256_4h+112(%rip), %xmm0 + movdqa %xmm0, 112(%rdi) + + addq $8+67*16, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + addq $32, %rsp + popq %rdi +#endif + ret + + +#if defined(USE_AVX) + + .p2align 6 +sha256d_ms_4way_avx: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_avx_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpsrld $4, %xmm0, %xmm1 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm0, %xmm0 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, 
%xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_avx_extend_loop2: + sha256_avx_extend_doubleround 16 + sha256_avx_extend_doubleround 18 + sha256_avx_extend_doubleround 20 + sha256_avx_extend_doubleround 22 + sha256_avx_extend_doubleround 24 + sha256_avx_extend_doubleround 26 + sha256_avx_extend_doubleround 28 + sha256_avx_extend_doubleround 30 + sha256_avx_extend_doubleround 32 + sha256_avx_extend_doubleround 34 + sha256_avx_extend_doubleround 36 + sha256_avx_extend_doubleround 38 + sha256_avx_extend_doubleround 40 + sha256_avx_extend_doubleround 42 + jz sha256d_ms_4way_avx_extend_coda2 + sha256_avx_extend_doubleround 44 + sha256_avx_extend_doubleround 46 + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop1 + +sha256d_ms_4way_avx_main_loop2: + sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_avx_main_loop1: + sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 4 + sha256_avx_main_quadround 8 + sha256_avx_main_quadround 12 + sha256_avx_main_quadround 16 + sha256_avx_main_quadround 20 + sha256_avx_main_quadround 24 + sha256_avx_main_quadround 28 + sha256_avx_main_quadround 32 + sha256_avx_main_quadround 36 + sha256_avx_main_quadround 40 + sha256_avx_main_quadround 44 + sha256_avx_main_quadround 48 + sha256_avx_main_quadround 52 + sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, 
%xmm5, %xmm7 + jz sha256d_ms_4way_avx_finish + sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_avx_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vpslld $14, %xmm0, %xmm2 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpsrld $7, %xmm0, %xmm1 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpsrld $11, %xmm1, %xmm1 + vpsrld $11, %xmm5, %xmm5 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpslld $11, %xmm2, %xmm2 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_avx_extend_doubleround 2 + sha256_avx_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vpslld $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpsrld $7, %xmm0, %xmm1 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpsrld $11, %xmm1, %xmm1 + vpslld $11, %xmm2, %xmm2 + vpxor %xmm1, %xmm8, %xmm8 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, 
%xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vpslld $14, %xmm4, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpsrld $4, %xmm4, %xmm5 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpsrld $11, %xmm5, %xmm5 + vpslld $11, %xmm6, %xmm6 + vpxor %xmm5, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vpslld $13, %xmm3, %xmm2 + vpslld $13, %xmm7, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $7, %xmm3, %xmm1 + vpsrld $7, %xmm7, %xmm5 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpsrld $2, %xmm1, %xmm1 + vpsrld $2, %xmm5, %xmm5 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpslld $2, %xmm2, %xmm2 + vpslld $2, %xmm6, %xmm6 + vpxor %xmm1, %xmm3, %xmm3 + vpxor %xmm5, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_avx_extend_loop2 + +sha256d_ms_4way_avx_extend_coda2: + sha256_avx_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_avx_main_loop2 + +.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vpslld $7, \r3, %xmm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor 
%xmm2, \r0, \r0 + vpslld $14, %xmm1, %xmm1 + vpsrld $14, %xmm2, %xmm2 + vpxor %xmm1, \r0, \r0 + vpxor %xmm2, \r0, \r0 + vpslld $5, %xmm1, %xmm1 + vpxor %xmm1, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_avx_finish: + sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX */ + + +#if defined(USE_XOP) + + .p2align 6 +sha256d_ms_4way_xop: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + movdqa %xmm6, 0(%rsp) + movdqa %xmm7, 16(%rsp) + movdqa %xmm8, 32(%rsp) + movdqa %xmm9, 48(%rsp) + movdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + subq $1032, %rsp + + leaq 256(%rsi), %rax + +sha256d_ms_4way_xop_extend_loop1: + vmovdqa 3*16(%rsi), %xmm0 + vmovdqa 2*16(%rax), %xmm3 + vmovdqa 3*16(%rax), %xmm7 + vmovdqa %xmm3, 2*16(%rsp) + vmovdqa %xmm7, 3*16(%rsp) + vpaddd %xmm0, %xmm7, %xmm7 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm0 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm0, %xmm0 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 2*16(%rax) + vmovdqa %xmm7, 3*16(%rax) + + vmovdqa 4*16(%rax), %xmm0 + vmovdqa %xmm0, 4*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vmovdqa %xmm3, 4*16(%rax) + vmovdqa %xmm7, 5*16(%rax) + + vmovdqa 6*16(%rax), %xmm0 + vmovdqa 7*16(%rax), %xmm4 + vmovdqa %xmm0, 6*16(%rsp) + vmovdqa %xmm4, 7*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vmovdqa 8*16(%rax), %xmm0 + vmovdqa 2*16(%rax), %xmm4 + vmovdqa %xmm0, 8*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, 
%xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa 14*16(%rax), %xmm0 + vmovdqa 15*16(%rax), %xmm4 + vmovdqa %xmm0, 14*16(%rsp) + vmovdqa %xmm4, 15*16(%rsp) + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + +sha256d_ms_4way_xop_extend_loop2: + sha256_xop_extend_doubleround 16 + sha256_xop_extend_doubleround 18 + sha256_xop_extend_doubleround 20 + sha256_xop_extend_doubleround 22 + sha256_xop_extend_doubleround 24 + sha256_xop_extend_doubleround 26 + sha256_xop_extend_doubleround 28 + sha256_xop_extend_doubleround 30 + sha256_xop_extend_doubleround 32 + sha256_xop_extend_doubleround 34 + sha256_xop_extend_doubleround 36 + sha256_xop_extend_doubleround 38 + sha256_xop_extend_doubleround 40 + sha256_xop_extend_doubleround 42 + jz sha256d_ms_4way_xop_extend_coda2 + sha256_xop_extend_doubleround 44 + sha256_xop_extend_doubleround 46 + + movdqa 0(%rcx), %xmm7 + movdqa 16(%rcx), %xmm8 + movdqa 32(%rcx), %xmm9 + movdqa 48(%rcx), %xmm10 + movdqa 64(%rcx), %xmm0 + movdqa 80(%rcx), %xmm5 + movdqa 96(%rcx), %xmm4 + movdqa 112(%rcx), %xmm3 + + movq %rsi, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop1 + +sha256d_ms_4way_xop_main_loop2: + sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 +sha256d_ms_4way_xop_main_loop1: + sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 4 + sha256_xop_main_quadround 8 + sha256_xop_main_quadround 12 + sha256_xop_main_quadround 16 + sha256_xop_main_quadround 20 + sha256_xop_main_quadround 24 + sha256_xop_main_quadround 28 + sha256_xop_main_quadround 32 + sha256_xop_main_quadround 36 + sha256_xop_main_quadround 40 + sha256_xop_main_quadround 44 + sha256_xop_main_quadround 48 + sha256_xop_main_quadround 52 + sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + jz sha256d_ms_4way_xop_finish + sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 + sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 + sha256_xop_main_quadround 60 + + movdqa 2*16(%rsp), %xmm1 + movdqa 3*16(%rsp), %xmm2 + movdqa 4*16(%rsp), %xmm6 + movdqa %xmm1, 18*16(%rsi) + movdqa %xmm2, 19*16(%rsi) + movdqa %xmm6, 20*16(%rsi) + movdqa 6*16(%rsp), %xmm1 + movdqa 7*16(%rsp), %xmm2 + movdqa 8*16(%rsp), %xmm6 + movdqa %xmm1, 22*16(%rsi) + movdqa %xmm2, 23*16(%rsi) + movdqa %xmm6, 24*16(%rsi) + movdqa 14*16(%rsp), %xmm1 + movdqa 15*16(%rsp), %xmm2 + movdqa %xmm1, 30*16(%rsi) + movdqa %xmm2, 31*16(%rsi) + + paddd 0(%rdx), %xmm7 + paddd 16(%rdx), %xmm5 + paddd 32(%rdx), %xmm4 + 
paddd 48(%rdx), %xmm3 + paddd 64(%rdx), %xmm0 + paddd 80(%rdx), %xmm8 + paddd 96(%rdx), %xmm9 + paddd 112(%rdx), %xmm10 + + movdqa %xmm7, 0(%rsp) + movdqa %xmm5, 16(%rsp) + movdqa %xmm4, 32(%rsp) + movdqa %xmm3, 48(%rsp) + movdqa %xmm0, 64(%rsp) + movdqa %xmm8, 80(%rsp) + movdqa %xmm9, 96(%rsp) + movdqa %xmm10, 112(%rsp) + + pxor %xmm0, %xmm0 + movq $0x8000000000000100, %rax + movd %rax, %xmm1 + pshufd $0x55, %xmm1, %xmm2 + pshufd $0x00, %xmm1, %xmm1 + movdqa %xmm2, 128(%rsp) + movdqa %xmm0, 144(%rsp) + movdqa %xmm0, 160(%rsp) + movdqa %xmm0, 176(%rsp) + movdqa %xmm0, 192(%rsp) + movdqa %xmm0, 208(%rsp) + movdqa %xmm0, 224(%rsp) + movdqa %xmm1, 240(%rsp) + + leaq 256(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*16(%rax), %xmm0 + vmovdqa -14*16(%rax), %xmm4 + vprotd $25, %xmm0, %xmm1 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm0, %xmm2 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm0, %xmm8 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm2, %xmm8, %xmm8 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd %xmm0, %xmm4, %xmm4 + vpaddd -16*16(%rax), %xmm8, %xmm3 + vpaddd sha256d_4preext2_17(%rip), %xmm4, %xmm7 + vmovdqa %xmm3, 0*16(%rax) + vmovdqa %xmm7, 1*16(%rax) + + sha256_xop_extend_doubleround 2 + sha256_xop_extend_doubleround 4 + + vmovdqa -9*16(%rax), %xmm0 + vprotd $25, %xmm0, %xmm1 + vprotd $14, %xmm0, %xmm2 + vpsrld $3, %xmm0, %xmm8 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm2, %xmm8, %xmm8 + vpaddd sha256d_4preext2_23(%rip), %xmm0, %xmm4 + vpaddd -10*16(%rax), %xmm8, %xmm0 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd -1*16(%rax), %xmm0, %xmm0 + vpaddd 0*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 6*16(%rax) + vmovdqa %xmm7, 7*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd sha256d_4preext2_24(%rip), %xmm3, %xmm3 + vpaddd 1*16(%rax), %xmm3, %xmm3 + vpaddd 2*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 8*16(%rax) + vmovdqa %xmm7, 9*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 3*16(%rax), %xmm3, %xmm3 + vpaddd 4*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 10*16(%rax) + vmovdqa %xmm7, 11*16(%rax) + + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, %xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd 5*16(%rax), %xmm3, %xmm3 + vpaddd 6*16(%rax), %xmm7, %xmm7 + vmovdqa %xmm3, 12*16(%rax) + vmovdqa %xmm7, 13*16(%rax) + + vmovdqa sha256d_4preext2_30(%rip), %xmm0 + vmovdqa 0*16(%rax), %xmm4 + vprotd $25, %xmm4, %xmm5 + vprotd $14, %xmm4, %xmm6 + vpxor %xmm5, %xmm6, %xmm6 + vpsrld $3, %xmm4, %xmm4 + vpxor %xmm6, %xmm4, %xmm4 + vpaddd -1*16(%rax), %xmm4, %xmm4 + vprotd $15, %xmm3, %xmm1 + vprotd $15, %xmm7, %xmm5 + vprotd $13, 
%xmm3, %xmm2 + vprotd $13, %xmm7, %xmm6 + vpxor %xmm1, %xmm2, %xmm2 + vpxor %xmm5, %xmm6, %xmm6 + vpaddd 7*16(%rax), %xmm0, %xmm0 + vpaddd 8*16(%rax), %xmm4, %xmm4 + vpsrld $10, %xmm3, %xmm3 + vpsrld $10, %xmm7, %xmm7 + vpxor %xmm2, %xmm3, %xmm3 + vpxor %xmm6, %xmm7, %xmm7 + vpaddd %xmm0, %xmm3, %xmm3 + vpaddd %xmm4, %xmm7, %xmm7 + vmovdqa %xmm3, 14*16(%rax) + vmovdqa %xmm7, 15*16(%rax) + + jmp sha256d_ms_4way_xop_extend_loop2 + +sha256d_ms_4way_xop_extend_coda2: + sha256_xop_extend_round 44 + + movdqa sha256_4h+0(%rip), %xmm7 + movdqa sha256_4h+16(%rip), %xmm5 + movdqa sha256_4h+32(%rip), %xmm4 + movdqa sha256_4h+48(%rip), %xmm3 + movdqa sha256_4h+64(%rip), %xmm0 + movdqa sha256_4h+80(%rip), %xmm8 + movdqa sha256_4h+96(%rip), %xmm9 + movdqa sha256_4h+112(%rip), %xmm10 + + movq %rsp, %rax + leaq sha256_4k(%rip), %rcx + jmp sha256d_ms_4way_xop_main_loop2 + +.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 16*\i(%rax), \r0, %xmm6 + vpaddd 16*\i(%rcx), %xmm6, %xmm6 + vpandn \r1, \r3, %xmm1 + vpand \r3, \r2, %xmm2 + vpxor %xmm2, %xmm1, %xmm1 + vpaddd %xmm1, %xmm6, %xmm6 + vprotd $26, \r3, %xmm1 + vprotd $21, \r3, %xmm2 + vpxor %xmm1, %xmm2, %xmm2 + vprotd $7, \r3, \r0 + vpxor %xmm2, \r0, \r0 + vpaddd \r0, %xmm6, %xmm6 + vpaddd %xmm6, \r4, \r0 +.endm + +sha256d_ms_4way_xop_finish: + sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 + sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 + sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 + sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + + paddd sha256_4h+112(%rip), %xmm10 + movdqa %xmm10, 112(%rdi) + + addq $1032, %rsp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + movdqa 0(%rsp), %xmm6 + movdqa 16(%rsp), %xmm7 + movdqa 32(%rsp), %xmm8 + movdqa 48(%rsp), %xmm9 + movdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + +#endif /* USE_XOP */ + + + .text + .p2align 6 + .globl sha256_use_4way + .globl _sha256_use_4way +sha256_use_4way: +_sha256_use_4way: + pushq %rbx + pushq %rcx + pushq %rdx + +#if defined(USE_AVX) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_4way_base + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_4way_base +#if defined(USE_XOP) + /* Check for XOP support */ + movl $0x80000001, %eax + cpuid + andl $0x00000800, %ecx + jz sha256_use_4way_avx + +sha256_use_4way_xop: + leaq sha256d_ms_4way_xop(%rip), %rcx + leaq sha256_transform_4way_core_xop(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_XOP */ + +sha256_use_4way_avx: + leaq sha256d_ms_4way_avx(%rip), %rcx + leaq sha256_transform_4way_core_avx(%rip), %rdx + jmp sha256_use_4way_done +#endif /* USE_AVX */ + +sha256_use_4way_base: + leaq sha256d_ms_4way_sse2(%rip), %rcx + leaq sha256_transform_4way_core_sse2(%rip), %rdx + +sha256_use_4way_done: + movq %rcx, sha256d_ms_4way_addr(%rip) + movq %rdx, sha256_transform_4way_core_addr(%rip) + popq %rdx + popq %rcx + popq %rbx + movl $1, %eax + ret + + +#if defined(USE_AVX2) + + .text + .p2align 6 + .globl sha256d_ms_8way + .globl _sha256d_ms_8way +sha256d_ms_8way: +_sha256d_ms_8way: +sha256d_ms_8way_avx2: +#if defined(_WIN64) || defined(__CYGWIN__) + pushq %rdi + subq $80, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq 
%r8, %rdx + movq %r9, %rcx +#endif + pushq %rbp + movq %rsp, %rbp + subq $64*32, %rsp + andq $-128, %rsp + + leaq 16*32(%rsi), %rax + +sha256d_ms_8way_avx2_extend_loop1: + vmovdqa 3*32(%rsi), %ymm0 + vmovdqa 2*32(%rax), %ymm3 + vmovdqa 3*32(%rax), %ymm7 + vmovdqa %ymm3, 2*32(%rsp) + vmovdqa %ymm7, 3*32(%rsp) + vpaddd %ymm0, %ymm7, %ymm7 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, 3*32(%rax) + + vmovdqa 4*32(%rax), %ymm0 + vmovdqa %ymm0, 4*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, 5*32(%rax) + + vmovdqa 6*32(%rax), %ymm0 + vmovdqa 7*32(%rax), %ymm4 + vmovdqa %ymm0, 6*32(%rsp) + vmovdqa %ymm4, 7*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vmovdqa 8*32(%rax), %ymm0 + vmovdqa 2*32(%rax), %ymm4 + vmovdqa %ymm0, 8*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, 
%ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa 14*32(%rax), %ymm0 + vmovdqa 15*32(%rax), %ymm4 + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm4, 15*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + +sha256d_ms_8way_avx2_extend_loop2: + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + jz sha256d_ms_8way_avx2_extend_coda2 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + + vmovdqa 0(%rcx), %ymm7 + vmovdqa 32(%rcx), %ymm8 + vmovdqa 64(%rcx), %ymm9 + vmovdqa 96(%rcx), %ymm10 + vmovdqa 128(%rcx), %ymm0 + vmovdqa 160(%rcx), %ymm5 + vmovdqa 192(%rcx), %ymm4 + vmovdqa 224(%rcx), %ymm3 + + movq %rsi, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop1 + +sha256d_ms_8way_avx2_main_loop2: + sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 +sha256d_ms_8way_avx2_main_loop1: + sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + jz sha256d_ms_8way_avx2_finish + sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 60 + + vmovdqa 2*32(%rsp), %ymm1 + vmovdqa 3*32(%rsp), %ymm2 + vmovdqa 4*32(%rsp), %ymm6 + vmovdqa %ymm1, 18*32(%rsi) + vmovdqa %ymm2, 19*32(%rsi) + vmovdqa %ymm6, 20*32(%rsi) + vmovdqa 6*32(%rsp), %ymm1 
+ vmovdqa 7*32(%rsp), %ymm2 + vmovdqa 8*32(%rsp), %ymm6 + vmovdqa %ymm1, 22*32(%rsi) + vmovdqa %ymm2, 23*32(%rsi) + vmovdqa %ymm6, 24*32(%rsi) + vmovdqa 14*32(%rsp), %ymm1 + vmovdqa 15*32(%rsp), %ymm2 + vmovdqa %ymm1, 30*32(%rsi) + vmovdqa %ymm2, 31*32(%rsi) + + vpaddd 0(%rdx), %ymm7, %ymm7 + vpaddd 32(%rdx), %ymm5, %ymm5 + vpaddd 64(%rdx), %ymm4, %ymm4 + vpaddd 96(%rdx), %ymm3, %ymm3 + vpaddd 128(%rdx), %ymm0, %ymm0 + vpaddd 160(%rdx), %ymm8, %ymm8 + vpaddd 192(%rdx), %ymm9, %ymm9 + vpaddd 224(%rdx), %ymm10, %ymm10 + + vmovdqa %ymm7, 0(%rsp) + vmovdqa %ymm5, 32(%rsp) + vmovdqa %ymm4, 64(%rsp) + vmovdqa %ymm3, 96(%rsp) + vmovdqa %ymm0, 128(%rsp) + vmovdqa %ymm8, 160(%rsp) + vmovdqa %ymm9, 192(%rsp) + vmovdqa %ymm10, 224(%rsp) + + vpxor %ymm0, %ymm0, %ymm0 + movq $0x8000000000000100, %rax + vmovd %rax, %xmm1 + vinserti128 $1, %xmm1, %ymm1, %ymm1 + vpshufd $0x55, %ymm1, %ymm2 + vpshufd $0x00, %ymm1, %ymm1 + vmovdqa %ymm2, 8*32(%rsp) + vmovdqa %ymm0, 9*32(%rsp) + vmovdqa %ymm0, 10*32(%rsp) + vmovdqa %ymm0, 11*32(%rsp) + vmovdqa %ymm0, 12*32(%rsp) + vmovdqa %ymm0, 13*32(%rsp) + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm1, 15*32(%rsp) + + leaq 16*32(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*32(%rax), %ymm0 + vmovdqa -14*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd -16*32(%rax), %ymm8, %ymm3 + vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7 + vmovdqa %ymm3, 0*32(%rax) + vmovdqa %ymm7, 1*32(%rax) + + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + + vmovdqa -9*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm8 + vpsrld $7, %ymm0, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4 + vpaddd -10*32(%rax), %ymm8, %ymm0 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd -1*32(%rax), %ymm0, %ymm0 + vpaddd 0*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd sha256d_8preext2_24(%rip), %ymm3, 
%ymm3 + vpaddd 1*32(%rax), %ymm3, %ymm3 + vpaddd 2*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa sha256d_8preext2_30(%rip), %ymm0 + vmovdqa 0*32(%rax), %ymm4 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm4, %ymm4 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrld $11, %ymm5, %ymm5 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd -1*32(%rax), %ymm4, %ymm4 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + + jmp sha256d_ms_8way_avx2_extend_loop2 + +sha256d_ms_8way_avx2_extend_coda2: + sha256_avx2_extend_round 44 + + vmovdqa sha256_8h+0(%rip), %ymm7 + vmovdqa sha256_8h+32(%rip), %ymm5 + vmovdqa sha256_8h+64(%rip), %ymm4 + vmovdqa sha256_8h+96(%rip), %ymm3 + vmovdqa sha256_8h+128(%rip), %ymm0 + vmovdqa sha256_8h+160(%rip), %ymm8 + vmovdqa sha256_8h+192(%rip), %ymm9 + vmovdqa sha256_8h+224(%rip), %ymm10 + + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop2 + +.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 32*\i(%rax), \r0, %ymm6 + vpaddd 32*\i(%rcx), %ymm6, %ymm6 + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 +.endm + +sha256d_ms_8way_avx2_finish: + sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 + 
sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 + sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 + sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + + vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 + vmovdqa %ymm10, 224(%rdi) + + movq %rbp, %rsp + popq %rbp +#if defined(_WIN64) || defined(__CYGWIN__) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + + .text + .p2align 6 + .globl sha256_use_8way + .globl _sha256_use_8way +sha256_use_8way: +_sha256_use_8way: + pushq %rbx + + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_8way_no + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne sha256_use_8way_no + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_8way_no + +sha256_use_8way_yes: + movl $1, %eax + jmp sha256_use_8way_done + +sha256_use_8way_no: + xorl %eax, %eax + +sha256_use_8way_done: + popq %rbx + ret + +#endif /* USE_AVX2 */ + +#endif diff --git a/sha2-x86.S b/sha2-x86.S new file mode 100644 index 00000000..89bf4a97 --- /dev/null +++ b/sha2-x86.S @@ -0,0 +1,1193 @@ +/* + * Copyright 2012 pooler@litecoinpool.org + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. See COPYING for more details. + */ + +#include "cpuminer-config.h" + +#if defined(__linux__) && defined(__ELF__) + .section .note.GNU-stack,"",%progbits +#endif + +#if defined(__i386__) + + .data + .p2align 7 +sha256_4h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_4k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 
0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_4preext2_15: + .long 0x00000100, 0x00000100, 0x00000100, 0x00000100 +sha256d_4preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_4preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_4preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_4preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 + + + .text + .p2align 5 + .globl sha256_init_4way + .globl _sha256_init_4way +sha256_init_4way: +_sha256_init_4way: + movl 4(%esp), %edx + movdqa sha256_4h+0, %xmm0 + movdqa sha256_4h+16, %xmm1 + movdqa sha256_4h+32, %xmm2 + movdqa sha256_4h+48, %xmm3 + movdqu %xmm0, 0(%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm3 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + ret + + +.macro sha256_sse2_extend_round i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor 
%xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd (\i-16)*16(%eax), %xmm0 + paddd (\i-7)*16(%eax), %xmm0 + + movdqa %xmm3, %xmm2 + psrld $10, %xmm3 + pslld $13, %xmm2 + movdqa %xmm3, %xmm1 + psrld $7, %xmm1 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + psrld $2, %xmm1 + pslld $2, %xmm2 + pxor %xmm1, %xmm3 + pxor %xmm2, %xmm3 + paddd %xmm0, %xmm3 + movdqa %xmm3, \i*16(%eax) +.endm + +.macro sha256_sse2_extend_doubleround i + movdqa (\i-15)*16(%eax), %xmm0 + movdqa (\i-14)*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd (\i-16)*16(%eax), %xmm0 + paddd (\i-15)*16(%eax), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd (\i-7)*16(%eax), %xmm0 + paddd (\i-6)*16(%eax), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, \i*16(%eax) + movdqa %xmm7, (\i+1)*16(%eax) +.endm + +.macro sha256_sse2_main_round i + movdqa 16*(\i)(%eax), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + paddd 32(%esp), %xmm6 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + paddd 16*(\i)+sha256_4k, %xmm6 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pslld $5, %xmm1 + pxor %xmm2, %xmm0 + pxor %xmm1, %xmm0 + movdqa %xmm5, %xmm1 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + paddd %xmm6, %xmm0 + pand %xmm5, %xmm2 + pand %xmm7, %xmm1 + pand %xmm7, %xmm4 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pslld $9, %xmm2 + pxor %xmm1, %xmm7 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pslld $11, %xmm2 + pxor %xmm1, %xmm7 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 +.endm + +.macro sha256_sse2_main_quadround i + sha256_sse2_main_round \i+0 + sha256_sse2_main_round \i+1 + sha256_sse2_main_round \i+2 + sha256_sse2_main_round \i+3 +.endm + + +.macro p2bswap_esi_esp i + movdqu \i*16(%esi), %xmm0 + movdqu (\i+1)*16(%esi), %xmm2 + pshuflw $0xb1, %xmm0, %xmm0 + pshuflw $0xb1, %xmm2, %xmm2 + pshufhw $0xb1, %xmm0, %xmm0 + pshufhw $0xb1, %xmm2, %xmm2 + movdqa %xmm0, %xmm1 + movdqa %xmm2, %xmm3 + psrlw $8, %xmm1 + psrlw $8, %xmm3 + psllw $8, %xmm0 + psllw $8, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm3, %xmm2 + movdqa %xmm0, (\i+3)*16(%esp) + movdqa %xmm2, (\i+4)*16(%esp) +.endm + + .text + .p2align 5 + .globl sha256_transform_4way + .globl _sha256_transform_4way 
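+/*
+ * sha256_transform_4way(state, block, swap) runs four independent SHA-256
+ * compressions in parallel: each 128-bit XMM register holds the same word
+ * of the schedule or state for all four lanes, so one packed SSE2
+ * instruction stands in for four scalar operations.  In a hypothetical C
+ * view the interleaved layout would be uint32_t W4[64][4], with W4[i][j]
+ * holding word i of lane j; scanhash_sha256d_4way in sha2.c fills its
+ * buffers in exactly this order (data[i * 4 + j] = data[i]).
+ */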
+sha256_transform_4way: +_sha256_transform_4way: + pushl %edi + pushl %esi + movl 12(%esp), %edi + movl 16(%esp), %esi + movl 20(%esp), %ecx + movl %esp, %edx + subl $67*16, %esp + andl $-128, %esp + + testl %ecx, %ecx + jnz sha256_transform_4way_swap + + movdqu 0*16(%esi), %xmm0 + movdqu 1*16(%esi), %xmm1 + movdqu 2*16(%esi), %xmm2 + movdqu 3*16(%esi), %xmm3 + movdqu 4*16(%esi), %xmm4 + movdqu 5*16(%esi), %xmm5 + movdqu 6*16(%esi), %xmm6 + movdqu 7*16(%esi), %xmm7 + movdqa %xmm0, 3*16(%esp) + movdqa %xmm1, 4*16(%esp) + movdqa %xmm2, 5*16(%esp) + movdqa %xmm3, 6*16(%esp) + movdqa %xmm4, 7*16(%esp) + movdqa %xmm5, 8*16(%esp) + movdqa %xmm6, 9*16(%esp) + movdqa %xmm7, 10*16(%esp) + movdqu 8*16(%esi), %xmm0 + movdqu 9*16(%esi), %xmm1 + movdqu 10*16(%esi), %xmm2 + movdqu 11*16(%esi), %xmm3 + movdqu 12*16(%esi), %xmm4 + movdqu 13*16(%esi), %xmm5 + movdqu 14*16(%esi), %xmm6 + movdqu 15*16(%esi), %xmm7 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm1, 12*16(%esp) + movdqa %xmm2, 13*16(%esp) + movdqa %xmm3, 14*16(%esp) + movdqa %xmm4, 15*16(%esp) + movdqa %xmm5, 16*16(%esp) + movdqa %xmm6, 17*16(%esp) + movdqa %xmm7, 18*16(%esp) + jmp sha256_transform_4way_extend + + .p2align 5 +sha256_transform_4way_swap: + p2bswap_esi_esp 0 + p2bswap_esi_esp 2 + p2bswap_esi_esp 4 + p2bswap_esi_esp 6 + p2bswap_esi_esp 8 + p2bswap_esi_esp 10 + p2bswap_esi_esp 12 + p2bswap_esi_esp 14 + +sha256_transform_4way_extend: + leal 19*16(%esp), %ecx + leal 48*16(%ecx), %eax + movdqa -2*16(%ecx), %xmm3 + movdqa -1*16(%ecx), %xmm7 +sha256_transform_4way_extend_loop: + movdqa -15*16(%ecx), %xmm0 + movdqa -14*16(%ecx), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + + paddd -16*16(%ecx), %xmm0 + paddd -15*16(%ecx), %xmm4 + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + + paddd -7*16(%ecx), %xmm0 + paddd -6*16(%ecx), %xmm4 + + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, (%ecx) + movdqa %xmm7, 16(%ecx) + addl $2*16, %ecx + cmpl %ecx, %eax + jne sha256_transform_4way_extend_loop + + movdqu 0(%edi), %xmm7 + movdqu 16(%edi), %xmm5 + movdqu 32(%edi), %xmm4 + movdqu 48(%edi), %xmm3 + movdqu 64(%edi), %xmm0 + movdqu 80(%edi), %xmm1 + movdqu 96(%edi), %xmm2 + movdqu 112(%edi), %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + xorl %eax, %eax +sha256_transform_4way_main_loop: + movdqa 3*16(%esp, %eax), %xmm6 + paddd sha256_4k(%eax), %xmm6 + paddd 32(%esp), %xmm6 + + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + pandn %xmm2, %xmm1 + + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + + paddd %xmm1, %xmm6 + + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 
+ pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm0, %xmm6 + + movdqa %xmm3, %xmm0 + paddd %xmm6, %xmm0 + + movdqa %xmm5, %xmm1 + movdqa %xmm4, %xmm3 + movdqa %xmm4, %xmm2 + pand %xmm5, %xmm2 + pand %xmm7, %xmm4 + pand %xmm7, %xmm1 + pxor %xmm4, %xmm1 + movdqa %xmm5, %xmm4 + movdqa %xmm7, %xmm5 + pxor %xmm2, %xmm1 + paddd %xmm1, %xmm6 + + movdqa %xmm7, %xmm2 + psrld $2, %xmm7 + movdqa %xmm7, %xmm1 + pslld $10, %xmm2 + psrld $11, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $9, %xmm2 + psrld $9, %xmm1 + pxor %xmm2, %xmm7 + pxor %xmm1, %xmm7 + pslld $11, %xmm2 + pxor %xmm2, %xmm7 + paddd %xmm6, %xmm7 + + addl $16, %eax + cmpl $16*64, %eax + jne sha256_transform_4way_main_loop + + movdqu 0(%edi), %xmm1 + movdqu 16(%edi), %xmm2 + paddd %xmm1, %xmm7 + paddd %xmm2, %xmm5 + movdqu 32(%edi), %xmm1 + movdqu 48(%edi), %xmm2 + paddd %xmm1, %xmm4 + paddd %xmm2, %xmm3 + + movdqu %xmm7, 0(%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm4, 32(%edi) + movdqu %xmm3, 48(%edi) + + movdqu 64(%edi), %xmm1 + movdqu 80(%edi), %xmm2 + movdqu 96(%edi), %xmm6 + movdqu 112(%edi), %xmm7 + paddd %xmm1, %xmm0 + paddd 0(%esp), %xmm2 + paddd 16(%esp), %xmm6 + paddd 32(%esp), %xmm7 + + movdqu %xmm0, 64(%edi) + movdqu %xmm2, 80(%edi) + movdqu %xmm6, 96(%edi) + movdqu %xmm7, 112(%edi) + + movl %edx, %esp + popl %esi + popl %edi + ret + + + .text + .p2align 5 + .globl sha256d_ms_4way + .globl _sha256d_ms_4way +sha256d_ms_4way: +_sha256d_ms_4way: + pushl %edi + pushl %esi + pushl %ebp + movl 16(%esp), %edi + movl 20(%esp), %esi + movl 24(%esp), %edx + movl 28(%esp), %ecx + movl %esp, %ebp + subl $67*16, %esp + andl $-128, %esp + + leal 256(%esi), %eax + +sha256d_ms_4way_extend_loop1: + movdqa 3*16(%esi), %xmm0 + movdqa 2*16(%eax), %xmm3 + movdqa 3*16(%eax), %xmm7 + movdqa %xmm3, 5*16(%esp) + movdqa %xmm7, 6*16(%esp) + movdqa %xmm0, %xmm2 + paddd %xmm0, %xmm7 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd %xmm0, %xmm3 + movdqa %xmm3, 2*16(%eax) + movdqa %xmm7, 3*16(%eax) + + movdqa 4*16(%eax), %xmm0 + movdqa %xmm0, 7*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + movdqa %xmm3, 4*16(%eax) + movdqa %xmm7, 5*16(%eax) + + movdqa 6*16(%eax), %xmm0 + movdqa 7*16(%eax), %xmm4 + movdqa %xmm0, 9*16(%esp) + movdqa %xmm4, 10*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa 8*16(%eax), %xmm0 + movdqa 2*16(%eax), %xmm4 + movdqa %xmm0, 11*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, 
%xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa 14*16(%eax), %xmm0 + movdqa 15*16(%eax), %xmm4 + movdqa %xmm0, 17*16(%esp) + movdqa %xmm4, 18*16(%esp) + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + paddd 8*16(%eax), %xmm4 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + +sha256d_ms_4way_extend_loop2: + sha256_sse2_extend_doubleround 16 + sha256_sse2_extend_doubleround 18 + sha256_sse2_extend_doubleround 20 + sha256_sse2_extend_doubleround 22 + sha256_sse2_extend_doubleround 24 + sha256_sse2_extend_doubleround 26 + sha256_sse2_extend_doubleround 28 + sha256_sse2_extend_doubleround 30 + sha256_sse2_extend_doubleround 32 + sha256_sse2_extend_doubleround 34 + sha256_sse2_extend_doubleround 36 + sha256_sse2_extend_doubleround 38 + sha256_sse2_extend_doubleround 40 + sha256_sse2_extend_doubleround 42 + jz sha256d_ms_4way_extend_coda2 + sha256_sse2_extend_doubleround 44 + sha256_sse2_extend_doubleround 46 + + movdqa 0(%ecx), %xmm3 + movdqa 16(%ecx), %xmm0 + movdqa 32(%ecx), %xmm1 + movdqa 48(%ecx), %xmm2 + movdqa 64(%ecx), %xmm6 + movdqa 80(%ecx), %xmm7 + movdqa 96(%ecx), %xmm5 + movdqa 112(%ecx), %xmm4 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + movl %esi, %eax + jmp sha256d_ms_4way_main_loop1 + +sha256d_ms_4way_main_loop2: + sha256_sse2_main_round 0 + sha256_sse2_main_round 1 + sha256_sse2_main_round 2 +sha256d_ms_4way_main_loop1: + sha256_sse2_main_round 3 + sha256_sse2_main_quadround 4 + sha256_sse2_main_quadround 8 + sha256_sse2_main_quadround 12 + sha256_sse2_main_quadround 16 + sha256_sse2_main_quadround 20 + 
sha256_sse2_main_quadround 24 + sha256_sse2_main_quadround 28 + sha256_sse2_main_quadround 32 + sha256_sse2_main_quadround 36 + sha256_sse2_main_quadround 40 + sha256_sse2_main_quadround 44 + sha256_sse2_main_quadround 48 + sha256_sse2_main_quadround 52 + sha256_sse2_main_round 56 + jz sha256d_ms_4way_finish + sha256_sse2_main_round 57 + sha256_sse2_main_round 58 + sha256_sse2_main_round 59 + sha256_sse2_main_quadround 60 + + movdqa 5*16(%esp), %xmm1 + movdqa 6*16(%esp), %xmm2 + movdqa 7*16(%esp), %xmm6 + movdqa %xmm1, 18*16(%esi) + movdqa %xmm2, 19*16(%esi) + movdqa %xmm6, 20*16(%esi) + movdqa 9*16(%esp), %xmm1 + movdqa 10*16(%esp), %xmm2 + movdqa 11*16(%esp), %xmm6 + movdqa %xmm1, 22*16(%esi) + movdqa %xmm2, 23*16(%esi) + movdqa %xmm6, 24*16(%esi) + movdqa 17*16(%esp), %xmm1 + movdqa 18*16(%esp), %xmm2 + movdqa %xmm1, 30*16(%esi) + movdqa %xmm2, 31*16(%esi) + + movdqa 0(%esp), %xmm1 + movdqa 16(%esp), %xmm2 + movdqa 32(%esp), %xmm6 + paddd 0(%edx), %xmm7 + paddd 16(%edx), %xmm5 + paddd 32(%edx), %xmm4 + paddd 48(%edx), %xmm3 + paddd 64(%edx), %xmm0 + paddd 80(%edx), %xmm1 + paddd 96(%edx), %xmm2 + paddd 112(%edx), %xmm6 + + movdqa %xmm7, 48+0(%esp) + movdqa %xmm5, 48+16(%esp) + movdqa %xmm4, 48+32(%esp) + movdqa %xmm3, 48+48(%esp) + movdqa %xmm0, 48+64(%esp) + movdqa %xmm1, 48+80(%esp) + movdqa %xmm2, 48+96(%esp) + movdqa %xmm6, 48+112(%esp) + + movdqa sha256d_4preext2_15, %xmm1 + movdqa sha256d_4preext2_24, %xmm2 + pxor %xmm0, %xmm0 + movdqa %xmm2, 48+128(%esp) + movdqa %xmm0, 48+144(%esp) + movdqa %xmm0, 48+160(%esp) + movdqa %xmm0, 48+176(%esp) + movdqa %xmm0, 48+192(%esp) + movdqa %xmm0, 48+208(%esp) + movdqa %xmm0, 48+224(%esp) + movdqa %xmm1, 48+240(%esp) + + leal 19*16(%esp), %eax + cmpl %eax, %eax + + movdqa -15*16(%eax), %xmm0 + movdqa -14*16(%eax), %xmm4 + movdqa %xmm0, %xmm2 + movdqa %xmm4, %xmm6 + psrld $3, %xmm0 + psrld $3, %xmm4 + movdqa %xmm0, %xmm1 + movdqa %xmm4, %xmm5 + pslld $14, %xmm2 + pslld $14, %xmm6 + psrld $4, %xmm1 + psrld $4, %xmm5 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + psrld $11, %xmm1 + psrld $11, %xmm5 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + pslld $11, %xmm2 + pslld $11, %xmm6 + pxor %xmm1, %xmm0 + pxor %xmm5, %xmm4 + pxor %xmm2, %xmm0 + pxor %xmm6, %xmm4 + paddd -16*16(%eax), %xmm0 + paddd -15*16(%eax), %xmm4 + paddd sha256d_4preext2_17, %xmm4 + movdqa %xmm0, %xmm3 + movdqa %xmm4, %xmm7 + movdqa %xmm3, 0*16(%eax) + movdqa %xmm7, 1*16(%eax) + + sha256_sse2_extend_doubleround 2 + sha256_sse2_extend_doubleround 4 + + movdqa -9*16(%eax), %xmm0 + movdqa sha256d_4preext2_23, %xmm4 + movdqa %xmm0, %xmm2 + psrld $3, %xmm0 + movdqa %xmm0, %xmm1 + pslld $14, %xmm2 + psrld $4, %xmm1 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + psrld $11, %xmm1 + pslld $11, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + paddd -10*16(%eax), %xmm0 + paddd -9*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd -1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 0*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 6*16(%eax) + movdqa %xmm7, 7*16(%eax) + + movdqa sha256d_4preext2_24, %xmm0 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + 
movdqa %xmm7, %xmm5 + paddd 1*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd 2*16(%eax), %xmm7 + movdqa %xmm3, 8*16(%eax) + movdqa %xmm7, 9*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 3*16(%eax), %xmm3 + paddd 4*16(%eax), %xmm7 + movdqa %xmm3, 10*16(%eax) + movdqa %xmm7, 11*16(%eax) + + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd 5*16(%eax), %xmm3 + paddd 6*16(%eax), %xmm7 + movdqa %xmm3, 12*16(%eax) + movdqa %xmm7, 13*16(%eax) + + movdqa sha256d_4preext2_30, %xmm0 + movdqa 0*16(%eax), %xmm4 + movdqa %xmm4, %xmm6 + psrld $3, %xmm4 + movdqa %xmm4, %xmm5 + pslld $14, %xmm6 + psrld $4, %xmm5 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + psrld $11, %xmm5 + pslld $11, %xmm6 + pxor %xmm5, %xmm4 + pxor %xmm6, %xmm4 + paddd -1*16(%eax), %xmm4 + movdqa %xmm3, %xmm2 + movdqa %xmm7, %xmm6 + psrld $10, %xmm3 + psrld $10, %xmm7 + movdqa %xmm3, %xmm1 + movdqa %xmm7, %xmm5 + paddd 7*16(%eax), %xmm0 + pslld $13, %xmm2 + pslld $13, %xmm6 + psrld $7, %xmm1 + psrld $7, %xmm5 + paddd 8*16(%eax), %xmm4 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + psrld $2, %xmm1 + psrld $2, %xmm5 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + pslld $2, %xmm2 + pslld $2, %xmm6 + pxor %xmm1, %xmm3 + pxor %xmm5, %xmm7 + pxor %xmm2, %xmm3 + pxor %xmm6, %xmm7 + paddd %xmm0, %xmm3 + paddd %xmm4, %xmm7 + movdqa %xmm3, 14*16(%eax) + movdqa %xmm7, 15*16(%eax) + + jmp sha256d_ms_4way_extend_loop2 + +sha256d_ms_4way_extend_coda2: + sha256_sse2_extend_round 44 + + movdqa sha256_4h+0, %xmm7 + movdqa sha256_4h+16, %xmm5 + movdqa sha256_4h+32, %xmm4 + movdqa sha256_4h+48, %xmm3 + movdqa sha256_4h+64, %xmm0 + movdqa sha256_4h+80, %xmm1 + movdqa sha256_4h+96, %xmm2 + movdqa sha256_4h+112, %xmm6 + movdqa %xmm1, 0(%esp) + movdqa %xmm2, 16(%esp) + movdqa %xmm6, 32(%esp) + + leal 48(%esp), %eax + jmp sha256d_ms_4way_main_loop2 + +.macro sha256_sse2_main_round_red i, r7 + movdqa 16*(\i)(%eax), %xmm6 + paddd 16*(\i)+sha256_4k, %xmm6 + paddd 32(%esp), %xmm6 + movdqa %xmm0, %xmm1 + movdqa 16(%esp), %xmm2 + paddd \r7, %xmm6 + pandn %xmm2, %xmm1 + movdqa %xmm2, 32(%esp) + movdqa 0(%esp), %xmm2 + movdqa %xmm2, 16(%esp) + pand %xmm0, %xmm2 + pxor %xmm2, %xmm1 + movdqa %xmm0, 0(%esp) + paddd %xmm1, %xmm6 + movdqa %xmm0, %xmm1 + psrld $6, %xmm0 + movdqa %xmm0, %xmm2 + pslld $7, %xmm1 + psrld $5, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $14, %xmm1 + psrld $14, %xmm2 + pxor %xmm1, %xmm0 + pxor %xmm2, %xmm0 + pslld $5, %xmm1 + pxor %xmm1, %xmm0 + paddd %xmm6, %xmm0 +.endm + 
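+/*
+ * "Reduced" main rounds for the tail of the second hash of sha256d: the
+ * scanhash fast path only compares the final hash word (hash[7]) against
+ * the target, so rounds 57-60 below can skip the t1 = S0(a) + Maj(a, b, c)
+ * half of the round function and propagate just the e-chain, mirroring
+ * the trailing hash[2]..hash[7] updates in the C sha256d_ms.
+ */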
+sha256d_ms_4way_finish:
+        sha256_sse2_main_round_red 57, %xmm3
+        sha256_sse2_main_round_red 58, %xmm4
+        sha256_sse2_main_round_red 59, %xmm5
+        sha256_sse2_main_round_red 60, %xmm7
+
+        paddd sha256_4h+112, %xmm0
+        movdqa %xmm0, 112(%edi)
+
+        movl %ebp, %esp
+        popl %ebp
+        popl %esi
+        popl %edi
+        ret
+
+
+        .text
+        .p2align 5
+        .globl sha256_use_4way
+        .globl _sha256_use_4way
+sha256_use_4way:
+_sha256_use_4way:
+        pushl %ebx
+
+        /* Check for SSE2 availability */
+        movl $1, %eax
+        cpuid
+        andl $0x04000000, %edx
+        jnz sha256_use_4way_sse2
+        xorl %eax, %eax
+        popl %ebx
+        ret
+
+sha256_use_4way_sse2:
+        movl $1, %eax
+        popl %ebx
+        ret
+
+#endif
diff --git a/sha2.c b/sha2.c
new file mode 100644
index 00000000..d13a4951
--- /dev/null
+++ b/sha2.c
@@ -0,0 +1,630 @@
+/*
+ * Copyright 2011 ArtForz
+ * Copyright 2011-2013 pooler
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */
+
+#include "cpuminer-config.h"
+#include "miner.h"
+
+#include <string.h>
+#include <inttypes.h>
+
+#if defined(__arm__) && defined(__APCS_32__)
+#define EXTERN_SHA256
+#endif
+
+static const uint32_t sha256_h[8] = {
+        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
+        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+static const uint32_t sha256_k[64] = {
+        0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+        0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+        0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+        0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+        0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+        0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+        0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+        0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+        0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+        0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+        0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+        0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+        0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+        0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+        0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+        0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+void sha256_init(uint32_t *state)
+{
+        memcpy(state, sha256_h, 32);
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z)     ((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)    ((x & (y | z)) | (y & z))
+#define ROTR(x, n)      ((x >> n) | (x << (32 - n)))
+#define S0(x)           (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x)           (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x)           (ROTR(x, 7) ^ ROTR(x, 18) ^ (x >> 3))
+#define s1(x)           (ROTR(x, 17) ^ ROTR(x, 19) ^ (x >> 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k) \
+        do { \
+                t0 = h + S1(e) + Ch(e, f, g) + k; \
+                t1 = S0(a) + Maj(a, b, c); \
+                d += t0; \
+                h = t0 + t1; \
+        } while (0)
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i) \
+        RND(S[(64 - i) % 8], S[(65 - i) % 8], \
+            S[(66 - i) % 8], S[(67 - i) % 8], \
+            S[(68 - i) % 8], S[(69 - i) % 8], \
+            S[(70 - i) % 8], S[(71 - i) % 8], \
+            W[i] + sha256_k[i])
+
+#ifndef EXTERN_SHA256
+
+/*
+ * SHA256 block compression function. The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+void sha256_transform(uint32_t *state, const uint32_t *block, int swap)
+{
+        uint32_t W[64];
+        uint32_t S[8];
+        uint32_t t0, t1;
+        int i;
+
+        /* 1. Prepare message schedule W.
*/ + if (swap) { + for (i = 0; i < 16; i++) + W[i] = swab32(block[i]); + } else + memcpy(W, block, 64); + for (i = 16; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. */ + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +#endif /* EXTERN_SHA256 */ + + +static const uint32_t sha256d_hash1[16] = { + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x80000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000100 +}; + +static void sha256d_80_swap(uint32_t *hash, const uint32_t *data) +{ + uint32_t S[16]; + int i; + + sha256_init(S); + sha256_transform(S, data, 0); + sha256_transform(S, data + 16, 0); + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(hash); + sha256_transform(hash, S, 0); + for (i = 0; i < 8; i++) + hash[i] = swab32(hash[i]); +} + +void sha256d(unsigned char *hash, const unsigned char *data, int len) +{ + uint32_t S[16], T[16]; + int i, r; + + sha256_init(S); + for (r = len; r > -9; r -= 64) { + if (r < 64) + memset(T, 0, 64); + memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 
0 : r)); + if (r >= 0 && r < 64) + ((unsigned char *)T)[r] = 0x80; + for (i = 0; i < 16; i++) + T[i] = be32dec(T + i); + if (r < 56) + T[15] = 8 * len; + sha256_transform(S, T, 0); + } + memcpy(S + 8, sha256d_hash1 + 8, 32); + sha256_init(T); + sha256_transform(T, S, 0); + for (i = 0; i < 8; i++) + be32enc((uint32_t *)hash + i, T[i]); +} + +static inline void sha256d_preextend(uint32_t *W) +{ + W[16] = s1(W[14]) + W[ 9] + s0(W[ 1]) + W[ 0]; + W[17] = s1(W[15]) + W[10] + s0(W[ 2]) + W[ 1]; + W[18] = s1(W[16]) + W[11] + W[ 2]; + W[19] = s1(W[17]) + W[12] + s0(W[ 4]); + W[20] = W[13] + s0(W[ 5]) + W[ 4]; + W[21] = W[14] + s0(W[ 6]) + W[ 5]; + W[22] = W[15] + s0(W[ 7]) + W[ 6]; + W[23] = W[16] + s0(W[ 8]) + W[ 7]; + W[24] = W[17] + s0(W[ 9]) + W[ 8]; + W[25] = s0(W[10]) + W[ 9]; + W[26] = s0(W[11]) + W[10]; + W[27] = s0(W[12]) + W[11]; + W[28] = s0(W[13]) + W[12]; + W[29] = s0(W[14]) + W[13]; + W[30] = s0(W[15]) + W[14]; + W[31] = s0(W[16]) + W[15]; +} + +static inline void sha256d_prehash(uint32_t *S, const uint32_t *W) +{ + uint32_t t0, t1; + RNDr(S, W, 0); + RNDr(S, W, 1); + RNDr(S, W, 2); +} + +#ifdef EXTERN_SHA256 + +void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash); + +#else + +static inline void sha256d_ms(uint32_t *hash, uint32_t *W, + const uint32_t *midstate, const uint32_t *prehash) +{ + uint32_t S[64]; + uint32_t t0, t1; + int i; + + S[18] = W[18]; + S[19] = W[19]; + S[20] = W[20]; + S[22] = W[22]; + S[23] = W[23]; + S[24] = W[24]; + S[30] = W[30]; + S[31] = W[31]; + + W[18] += s0(W[3]); + W[19] += W[3]; + W[20] += s1(W[18]); + W[21] = s1(W[19]); + W[22] += s1(W[20]); + W[23] += s1(W[21]); + W[24] += s1(W[22]); + W[25] = s1(W[23]) + W[18]; + W[26] = s1(W[24]) + W[19]; + W[27] = s1(W[25]) + W[20]; + W[28] = s1(W[26]) + W[21]; + W[29] = s1(W[27]) + W[22]; + W[30] += s1(W[28]) + W[23]; + W[31] += s1(W[29]) + W[24]; + for (i = 32; i < 64; i += 2) { + W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16]; + W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15]; + } + + memcpy(S, prehash, 32); + + RNDr(S, W, 3); + RNDr(S, W, 4); + RNDr(S, W, 5); + RNDr(S, W, 6); + RNDr(S, W, 7); + RNDr(S, W, 8); + RNDr(S, W, 9); + RNDr(S, W, 10); + RNDr(S, W, 11); + RNDr(S, W, 12); + RNDr(S, W, 13); + RNDr(S, W, 14); + RNDr(S, W, 15); + RNDr(S, W, 16); + RNDr(S, W, 17); + RNDr(S, W, 18); + RNDr(S, W, 19); + RNDr(S, W, 20); + RNDr(S, W, 21); + RNDr(S, W, 22); + RNDr(S, W, 23); + RNDr(S, W, 24); + RNDr(S, W, 25); + RNDr(S, W, 26); + RNDr(S, W, 27); + RNDr(S, W, 28); + RNDr(S, W, 29); + RNDr(S, W, 30); + RNDr(S, W, 31); + RNDr(S, W, 32); + RNDr(S, W, 33); + RNDr(S, W, 34); + RNDr(S, W, 35); + RNDr(S, W, 36); + RNDr(S, W, 37); + RNDr(S, W, 38); + RNDr(S, W, 39); + RNDr(S, W, 40); + RNDr(S, W, 41); + RNDr(S, W, 42); + RNDr(S, W, 43); + RNDr(S, W, 44); + RNDr(S, W, 45); + RNDr(S, W, 46); + RNDr(S, W, 47); + RNDr(S, W, 48); + RNDr(S, W, 49); + RNDr(S, W, 50); + RNDr(S, W, 51); + RNDr(S, W, 52); + RNDr(S, W, 53); + RNDr(S, W, 54); + RNDr(S, W, 55); + RNDr(S, W, 56); + RNDr(S, W, 57); + RNDr(S, W, 58); + RNDr(S, W, 59); + RNDr(S, W, 60); + RNDr(S, W, 61); + RNDr(S, W, 62); + RNDr(S, W, 63); + + for (i = 0; i < 8; i++) + S[i] += midstate[i]; + + W[18] = S[18]; + W[19] = S[19]; + W[20] = S[20]; + W[22] = S[22]; + W[23] = S[23]; + W[24] = S[24]; + W[30] = S[30]; + W[31] = S[31]; + + memcpy(S + 8, sha256d_hash1 + 8, 32); + S[16] = s1(sha256d_hash1[14]) + sha256d_hash1[ 9] + s0(S[ 1]) + S[ 0]; + S[17] = s1(sha256d_hash1[15]) + sha256d_hash1[10] + s0(S[ 2]) + 
S[ 1]; + S[18] = s1(S[16]) + sha256d_hash1[11] + s0(S[ 3]) + S[ 2]; + S[19] = s1(S[17]) + sha256d_hash1[12] + s0(S[ 4]) + S[ 3]; + S[20] = s1(S[18]) + sha256d_hash1[13] + s0(S[ 5]) + S[ 4]; + S[21] = s1(S[19]) + sha256d_hash1[14] + s0(S[ 6]) + S[ 5]; + S[22] = s1(S[20]) + sha256d_hash1[15] + s0(S[ 7]) + S[ 6]; + S[23] = s1(S[21]) + S[16] + s0(sha256d_hash1[ 8]) + S[ 7]; + S[24] = s1(S[22]) + S[17] + s0(sha256d_hash1[ 9]) + sha256d_hash1[ 8]; + S[25] = s1(S[23]) + S[18] + s0(sha256d_hash1[10]) + sha256d_hash1[ 9]; + S[26] = s1(S[24]) + S[19] + s0(sha256d_hash1[11]) + sha256d_hash1[10]; + S[27] = s1(S[25]) + S[20] + s0(sha256d_hash1[12]) + sha256d_hash1[11]; + S[28] = s1(S[26]) + S[21] + s0(sha256d_hash1[13]) + sha256d_hash1[12]; + S[29] = s1(S[27]) + S[22] + s0(sha256d_hash1[14]) + sha256d_hash1[13]; + S[30] = s1(S[28]) + S[23] + s0(sha256d_hash1[15]) + sha256d_hash1[14]; + S[31] = s1(S[29]) + S[24] + s0(S[16]) + sha256d_hash1[15]; + for (i = 32; i < 60; i += 2) { + S[i] = s1(S[i - 2]) + S[i - 7] + s0(S[i - 15]) + S[i - 16]; + S[i+1] = s1(S[i - 1]) + S[i - 6] + s0(S[i - 14]) + S[i - 15]; + } + S[60] = s1(S[58]) + S[53] + s0(S[45]) + S[44]; + + sha256_init(hash); + + RNDr(hash, S, 0); + RNDr(hash, S, 1); + RNDr(hash, S, 2); + RNDr(hash, S, 3); + RNDr(hash, S, 4); + RNDr(hash, S, 5); + RNDr(hash, S, 6); + RNDr(hash, S, 7); + RNDr(hash, S, 8); + RNDr(hash, S, 9); + RNDr(hash, S, 10); + RNDr(hash, S, 11); + RNDr(hash, S, 12); + RNDr(hash, S, 13); + RNDr(hash, S, 14); + RNDr(hash, S, 15); + RNDr(hash, S, 16); + RNDr(hash, S, 17); + RNDr(hash, S, 18); + RNDr(hash, S, 19); + RNDr(hash, S, 20); + RNDr(hash, S, 21); + RNDr(hash, S, 22); + RNDr(hash, S, 23); + RNDr(hash, S, 24); + RNDr(hash, S, 25); + RNDr(hash, S, 26); + RNDr(hash, S, 27); + RNDr(hash, S, 28); + RNDr(hash, S, 29); + RNDr(hash, S, 30); + RNDr(hash, S, 31); + RNDr(hash, S, 32); + RNDr(hash, S, 33); + RNDr(hash, S, 34); + RNDr(hash, S, 35); + RNDr(hash, S, 36); + RNDr(hash, S, 37); + RNDr(hash, S, 38); + RNDr(hash, S, 39); + RNDr(hash, S, 40); + RNDr(hash, S, 41); + RNDr(hash, S, 42); + RNDr(hash, S, 43); + RNDr(hash, S, 44); + RNDr(hash, S, 45); + RNDr(hash, S, 46); + RNDr(hash, S, 47); + RNDr(hash, S, 48); + RNDr(hash, S, 49); + RNDr(hash, S, 50); + RNDr(hash, S, 51); + RNDr(hash, S, 52); + RNDr(hash, S, 53); + RNDr(hash, S, 54); + RNDr(hash, S, 55); + RNDr(hash, S, 56); + + hash[2] += hash[6] + S1(hash[3]) + Ch(hash[3], hash[4], hash[5]) + + S[57] + sha256_k[57]; + hash[1] += hash[5] + S1(hash[2]) + Ch(hash[2], hash[3], hash[4]) + + S[58] + sha256_k[58]; + hash[0] += hash[4] + S1(hash[1]) + Ch(hash[1], hash[2], hash[3]) + + S[59] + sha256_k[59]; + hash[7] += hash[3] + S1(hash[0]) + Ch(hash[0], hash[1], hash[2]) + + S[60] + sha256_k[60] + + sha256_h[7]; +} + +#endif /* EXTERN_SHA256 */ + +#ifdef HAVE_SHA256_4WAY + +void sha256d_ms_4way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[4 * 64] __attribute__((aligned(128))); + uint32_t hash[4 * 8] __attribute__((aligned(32))); + uint32_t midstate[4 * 8] __attribute__((aligned(32))); + uint32_t prehash[4 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 4; j++) + data[i * 4 + j] 
= data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 4; j++) { + midstate[i * 4 + j] = midstate[i]; + prehash[i * 4 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 4; i++) + data[4 * 3 + i] = ++n; + + sha256d_ms_4way(hash, data, midstate, prehash); + + for (i = 0; i < 4; i++) { + if (swab32(hash[4 * 7 + i]) <= Htarg) { + pdata[19] = data[4 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_4WAY */ + +#ifdef HAVE_SHA256_8WAY + +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[8 * 64] __attribute__((aligned(128))); + uint32_t hash[8 * 8] __attribute__((aligned(32))); + uint32_t midstate[8 * 8] __attribute__((aligned(32))); + uint32_t prehash[8 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 8; j++) + data[i * 8 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 8; j++) { + midstate[i * 8 + j] = midstate[i]; + prehash[i * 8 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 8; i++) + data[8 * 3 + i] = ++n; + + sha256d_ms_8way(hash, data, midstate, prehash); + + for (i = 0; i < 8; i++) { + if (swab32(hash[8 * 7 + i]) <= Htarg) { + pdata[19] = data[8 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_8WAY */ + +int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, + uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[64] __attribute__((aligned(128))); + uint32_t hash[8] __attribute__((aligned(32))); + uint32_t midstate[8] __attribute__((aligned(32))); + uint32_t prehash[8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + +#ifdef HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif +#ifdef HAVE_SHA256_4WAY + if (sha256_use_4way()) + return scanhash_sha256d_4way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + + do { + data[3] = ++n; + sha256d_ms(hash, data, midstate, prehash); + if (swab32(hash[7]) <= Htarg) { + pdata[19] = data[3]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } while (n < 
max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} diff --git a/sha3/aes_helper.c b/sha3/aes_helper.c new file mode 100644 index 00000000..75b7cc69 --- /dev/null +++ b/sha3/aes_helper.c @@ -0,0 +1,392 @@ +/* $Id: aes_helper.c 220 2010-06-09 09:21:50Z tp $ */ +/* + * AES tables. This file is not meant to be compiled by itself; it + * is included by some hash function implementations. It contains + * the precomputed tables and helper macros for evaluating an AES + * round, optionally with a final XOR with a subkey. + * + * By default, this file defines the tables and macros for little-endian + * processing (i.e. it is assumed that the input bytes have been read + * from memory and assembled with the little-endian convention). If + * the 'AES_BIG_ENDIAN' macro is defined (to a non-zero integer value) + * when this file is included, then the tables and macros for big-endian + * processing are defined instead. The big-endian tables and macros have + * names distinct from the little-endian tables and macros, hence it is + * possible to have both simultaneously, by including this file twice + * (with and without the AES_BIG_ENDIAN macro). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include "sph_types.h" +#ifdef __cplusplus +extern "C"{ +#endif +#if AES_BIG_ENDIAN + +#define AESx(x) ( ((SPH_C32(x) >> 24) & SPH_C32(0x000000FF)) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) + +#define AES0 AES0_BE +#define AES1 AES1_BE +#define AES2 AES2_BE +#define AES3 AES3_BE + +#define AES_ROUND_BE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[((X0) >> 24) & 0xFF] \ + ^ AES1[((X1) >> 16) & 0xFF] \ + ^ AES2[((X2) >> 8) & 0xFF] \ + ^ AES3[(X3) & 0xFF] ^ (K0); \ + (Y1) = AES0[((X1) >> 24) & 0xFF] \ + ^ AES1[((X2) >> 16) & 0xFF] \ + ^ AES2[((X3) >> 8) & 0xFF] \ + ^ AES3[(X0) & 0xFF] ^ (K1); \ + (Y2) = AES0[((X2) >> 24) & 0xFF] \ + ^ AES1[((X3) >> 16) & 0xFF] \ + ^ AES2[((X0) >> 8) & 0xFF] \ + ^ AES3[(X1) & 0xFF] ^ (K2); \ + (Y3) = AES0[((X3) >> 24) & 0xFF] \ + ^ AES1[((X0) >> 16) & 0xFF] \ + ^ AES2[((X1) >> 8) & 0xFF] \ + ^ AES3[(X2) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_BE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_BE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#else + +#define AESx(x) SPH_C32(x) +#define AES0 AES0_LE +#define AES1 AES1_LE +#define AES2 AES2_LE +#define AES3 AES3_LE + +#define AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3) do { \ + (Y0) = AES0[(X0) & 0xFF] \ + ^ AES1[((X1) >> 8) & 0xFF] \ + ^ AES2[((X2) >> 16) & 0xFF] \ + ^ AES3[((X3) >> 24) & 0xFF] ^ (K0); \ + (Y1) = AES0[(X1) & 0xFF] \ + ^ AES1[((X2) >> 8) & 0xFF] \ + ^ AES2[((X3) >> 16) & 0xFF] \ + ^ AES3[((X0) >> 24) & 0xFF] ^ (K1); \ + (Y2) = AES0[(X2) & 0xFF] \ + ^ AES1[((X3) >> 8) & 0xFF] \ + ^ AES2[((X0) >> 16) & 0xFF] \ + ^ AES3[((X1) >> 24) & 0xFF] ^ (K2); \ + (Y3) = AES0[(X3) & 0xFF] \ + ^ AES1[((X0) >> 8) & 0xFF] \ + ^ AES2[((X1) >> 16) & 0xFF] \ + ^ AES3[((X2) >> 24) & 0xFF] ^ (K3); \ + } while (0) + +#define AES_ROUND_NOKEY_LE(X0, X1, X2, X3, Y0, Y1, Y2, Y3) \ + AES_ROUND_LE(X0, X1, X2, X3, 0, 0, 0, 0, Y0, Y1, Y2, Y3) + +#endif + +/* + * The AES*[] tables allow us to perform a fast evaluation of an AES + * round; table AESi[] combines SubBytes for a byte at row i, and + * MixColumns for the column where that byte goes after ShiftRows. 
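+ *
+ * With these tables a full AES round on a column-major state reduces to
+ * four table lookups and four XORs per output word; in the little-endian
+ * case, AES_ROUND_LE above computes, for example:
+ *
+ *   y0 = AES0[x0 & 0xFF] ^ AES1[(x1 >> 8) & 0xFF]
+ *      ^ AES2[(x2 >> 16) & 0xFF] ^ AES3[(x3 >> 24) & 0xFF] ^ k0;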
+ */ + +static const sph_u32 AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), + AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), 
AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +static const sph_u32 AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), 
AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +static const sph_u32 AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + 
AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + 
AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +static const sph_u32 AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), 
AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_blake.c b/sha3/sph_blake.c new file mode 100644 index 00000000..d8a651e4 --- /dev/null +++ b/sha3/sph_blake.c @@ -0,0 +1,1114 @@ +/* $Id: blake.c 252 2011-06-07 17:55:14Z tp $ */ +/* + * BLAKE implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_blake.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_SMALL_FOOTPRINT_BLAKE   1
+#endif
+
+#if SPH_SMALL_FOOTPRINT_BLAKE
+#define SPH_COMPACT_BLAKE_32   1
+#endif
+
+#if SPH_64 && (SPH_SMALL_FOOTPRINT_BLAKE || !SPH_64_TRUE)
+#define SPH_COMPACT_BLAKE_64   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[8] = {
+	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507),
+	SPH_C32(0x3070DD17), SPH_C32(0xF70E5939),
+	SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+
+static const sph_u32 IV256[8] = {
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85),
+	SPH_C32(0x3C6EF372), SPH_C32(0xA54FF53A),
+	SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[8] = {
+	SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507),
+	SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939),
+	SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511),
+	SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4)
+};
+
+static const sph_u64 IV512[8] = {
+	SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B),
+	SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1),
+	SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F),
+	SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179)
+};
+
+#endif
+
+#if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64
+
+static const unsigned sigma[16][16] = {
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
+	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
+	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
+	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
+	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
+	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
+	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
+	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
+	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
+	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
+	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 }
+};
+
+/*
+   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+  14 10  4  8  9 15 13  6  1 12  0  2 11  7  5  3
+  11  8 12  0  5  2 15 13 10 14  3  6  7  1  9  4
+   7  9  3  1 13 12 11 14  2  6  5 10  4  0 15  8
+   9  0  5  7  2  4 10 15 14  1 11 12  6  8  3 13
+   2 12  6 10  0 11  8  3  4 13  7  5 15 14  1  9
+  12  5  1 15 14 13  4 10  0  7  6  3  9  2  8 11
+  13 11  7 14 12  1  3  9  5  0 15  4  8  6  2 10
+   6 15 14  9 11  3  0  8 12  2 13  7  1  4 10  5
+  10  2  8  4  7  6  1  5 15 11  9 14  3 12 13  0
+*/
+#endif
+
+/*
+ * The ZrI macros hard-code the sigma permutation for the unrolled
+ * rounds: ZrI is sigma_r(I), written as a single hexadecimal digit,
+ * so that message words and constants can be selected at
+ * preprocessing time.
+ */
+#define Z00   0
+#define Z01   1
+#define Z02   2
+#define Z03   3
+#define Z04   4
+#define Z05   5
+#define Z06   6
+#define Z07   7
+#define Z08   8
+#define Z09   9
+#define Z0A   A
+#define Z0B   B
+#define Z0C   C
+#define Z0D   D
+#define Z0E   E
+#define Z0F   F
+
+#define Z10   E
+#define Z11   A
+#define Z12   4
+#define Z13   8
+#define Z14   9
+#define Z15   F
+#define Z16   D
+#define Z17   6
+#define Z18   1
+#define Z19   C
+#define Z1A   0
+#define Z1B   2
+#define Z1C   B
+#define Z1D   7
+#define Z1E   5
+#define Z1F   3
+
+#define Z20   B
+#define Z21 8 +#define Z22 C +#define Z23 0 +#define Z24 5 +#define Z25 2 +#define Z26 F +#define Z27 D +#define Z28 A +#define Z29 E +#define Z2A 3 +#define Z2B 6 +#define Z2C 7 +#define Z2D 1 +#define Z2E 9 +#define Z2F 4 + +#define Z30 7 +#define Z31 9 +#define Z32 3 +#define Z33 1 +#define Z34 D +#define Z35 C +#define Z36 B +#define Z37 E +#define Z38 2 +#define Z39 6 +#define Z3A 5 +#define Z3B A +#define Z3C 4 +#define Z3D 0 +#define Z3E F +#define Z3F 8 + +#define Z40 9 +#define Z41 0 +#define Z42 5 +#define Z43 7 +#define Z44 2 +#define Z45 4 +#define Z46 A +#define Z47 F +#define Z48 E +#define Z49 1 +#define Z4A B +#define Z4B C +#define Z4C 6 +#define Z4D 8 +#define Z4E 3 +#define Z4F D + +#define Z50 2 +#define Z51 C +#define Z52 6 +#define Z53 A +#define Z54 0 +#define Z55 B +#define Z56 8 +#define Z57 3 +#define Z58 4 +#define Z59 D +#define Z5A 7 +#define Z5B 5 +#define Z5C F +#define Z5D E +#define Z5E 1 +#define Z5F 9 + +#define Z60 C +#define Z61 5 +#define Z62 1 +#define Z63 F +#define Z64 E +#define Z65 D +#define Z66 4 +#define Z67 A +#define Z68 0 +#define Z69 7 +#define Z6A 6 +#define Z6B 3 +#define Z6C 9 +#define Z6D 2 +#define Z6E 8 +#define Z6F B + +#define Z70 D +#define Z71 B +#define Z72 7 +#define Z73 E +#define Z74 C +#define Z75 1 +#define Z76 3 +#define Z77 9 +#define Z78 5 +#define Z79 0 +#define Z7A F +#define Z7B 4 +#define Z7C 8 +#define Z7D 6 +#define Z7E 2 +#define Z7F A + +#define Z80 6 +#define Z81 F +#define Z82 E +#define Z83 9 +#define Z84 B +#define Z85 3 +#define Z86 0 +#define Z87 8 +#define Z88 C +#define Z89 2 +#define Z8A D +#define Z8B 7 +#define Z8C 1 +#define Z8D 4 +#define Z8E A +#define Z8F 5 + +#define Z90 A +#define Z91 2 +#define Z92 8 +#define Z93 4 +#define Z94 7 +#define Z95 6 +#define Z96 1 +#define Z97 5 +#define Z98 F +#define Z99 B +#define Z9A 9 +#define Z9B E +#define Z9C 3 +#define Z9D C +#define Z9E D +#define Z9F 0 + +#define Mx(r, i) Mx_(Z ## r ## i) +#define Mx_(n) Mx__(n) +#define Mx__(n) M ## n + +#define CSx(r, i) CSx_(Z ## r ## i) +#define CSx_(n) CSx__(n) +#define CSx__(n) CS ## n + +#define CS0 SPH_C32(0x243F6A88) +#define CS1 SPH_C32(0x85A308D3) +#define CS2 SPH_C32(0x13198A2E) +#define CS3 SPH_C32(0x03707344) +#define CS4 SPH_C32(0xA4093822) +#define CS5 SPH_C32(0x299F31D0) +#define CS6 SPH_C32(0x082EFA98) +#define CS7 SPH_C32(0xEC4E6C89) +#define CS8 SPH_C32(0x452821E6) +#define CS9 SPH_C32(0x38D01377) +#define CSA SPH_C32(0xBE5466CF) +#define CSB SPH_C32(0x34E90C6C) +#define CSC SPH_C32(0xC0AC29B7) +#define CSD SPH_C32(0xC97C50DD) +#define CSE SPH_C32(0x3F84D5B5) +#define CSF SPH_C32(0xB5470917) + +#if SPH_COMPACT_BLAKE_32 + +static const sph_u32 CS[16] = { + SPH_C32(0x243F6A88), SPH_C32(0x85A308D3), + SPH_C32(0x13198A2E), SPH_C32(0x03707344), + SPH_C32(0xA4093822), SPH_C32(0x299F31D0), + SPH_C32(0x082EFA98), SPH_C32(0xEC4E6C89), + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917) +}; + +#endif + +#if SPH_64 + +#define CBx(r, i) CBx_(Z ## r ## i) +#define CBx_(n) CBx__(n) +#define CBx__(n) CB ## n + +#define CB0 SPH_C64(0x243F6A8885A308D3) +#define CB1 SPH_C64(0x13198A2E03707344) +#define CB2 SPH_C64(0xA4093822299F31D0) +#define CB3 SPH_C64(0x082EFA98EC4E6C89) +#define CB4 SPH_C64(0x452821E638D01377) +#define CB5 SPH_C64(0xBE5466CF34E90C6C) +#define CB6 SPH_C64(0xC0AC29B7C97C50DD) +#define CB7 SPH_C64(0x3F84D5B5B5470917) +#define CB8 SPH_C64(0x9216D5D98979FB1B) +#define CB9 
SPH_C64(0xD1310BA698DFB5AC) +#define CBA SPH_C64(0x2FFD72DBD01ADFB7) +#define CBB SPH_C64(0xB8E1AFED6A267E96) +#define CBC SPH_C64(0xBA7C9045F12C7F99) +#define CBD SPH_C64(0x24A19947B3916CF7) +#define CBE SPH_C64(0x0801F2E2858EFC16) +#define CBF SPH_C64(0x636920D871574E69) + +#if SPH_COMPACT_BLAKE_64 + +static const sph_u64 CB[16] = { + SPH_C64(0x243F6A8885A308D3), SPH_C64(0x13198A2E03707344), + SPH_C64(0xA4093822299F31D0), SPH_C64(0x082EFA98EC4E6C89), + SPH_C64(0x452821E638D01377), SPH_C64(0xBE5466CF34E90C6C), + SPH_C64(0xC0AC29B7C97C50DD), SPH_C64(0x3F84D5B5B5470917), + SPH_C64(0x9216D5D98979FB1B), SPH_C64(0xD1310BA698DFB5AC), + SPH_C64(0x2FFD72DBD01ADFB7), SPH_C64(0xB8E1AFED6A267E96), + SPH_C64(0xBA7C9045F12C7F99), SPH_C64(0x24A19947B3916CF7), + SPH_C64(0x0801F2E2858EFC16), SPH_C64(0x636920D871574E69) +}; + +#endif + +#endif + +#define GS(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T32(a + b + (m0 ^ c1)); \ + d = SPH_ROTR32(d ^ a, 16); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 12); \ + a = SPH_T32(a + b + (m1 ^ c0)); \ + d = SPH_ROTR32(d ^ a, 8); \ + c = SPH_T32(c + d); \ + b = SPH_ROTR32(b ^ c, 7); \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define ROUND_S(r) do { \ + GS(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CS[sigma[r][0x0]], CS[sigma[r][0x1]], V0, V4, V8, VC); \ + GS(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CS[sigma[r][0x2]], CS[sigma[r][0x3]], V1, V5, V9, VD); \ + GS(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CS[sigma[r][0x4]], CS[sigma[r][0x5]], V2, V6, VA, VE); \ + GS(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CS[sigma[r][0x6]], CS[sigma[r][0x7]], V3, V7, VB, VF); \ + GS(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CS[sigma[r][0x8]], CS[sigma[r][0x9]], V0, V5, VA, VF); \ + GS(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + CS[sigma[r][0xA]], CS[sigma[r][0xB]], V1, V6, VB, VC); \ + GS(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CS[sigma[r][0xC]], CS[sigma[r][0xD]], V2, V7, V8, VD); \ + GS(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CS[sigma[r][0xE]], CS[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_S(r) do { \ + GS(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#if SPH_64 + +#define GB(m0, m1, c0, c1, a, b, c, d) do { \ + a = SPH_T64(a + b + (m0 ^ c1)); \ + d = SPH_ROTR64(d ^ a, 32); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 25); \ + a = SPH_T64(a + b + (m1 ^ c0)); \ + d = SPH_ROTR64(d ^ a, 16); \ + c = SPH_T64(c + d); \ + b = SPH_ROTR64(b ^ c, 11); \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define ROUND_B(r) do { \ + GB(M[sigma[r][0x0]], M[sigma[r][0x1]], \ + CB[sigma[r][0x0]], CB[sigma[r][0x1]], V0, V4, V8, VC); \ + GB(M[sigma[r][0x2]], M[sigma[r][0x3]], \ + CB[sigma[r][0x2]], CB[sigma[r][0x3]], V1, V5, V9, VD); \ + GB(M[sigma[r][0x4]], M[sigma[r][0x5]], \ + CB[sigma[r][0x4]], CB[sigma[r][0x5]], V2, V6, VA, VE); \ + GB(M[sigma[r][0x6]], M[sigma[r][0x7]], \ + CB[sigma[r][0x6]], CB[sigma[r][0x7]], V3, V7, VB, VF); \ + GB(M[sigma[r][0x8]], M[sigma[r][0x9]], \ + CB[sigma[r][0x8]], CB[sigma[r][0x9]], V0, V5, VA, VF); \ + GB(M[sigma[r][0xA]], M[sigma[r][0xB]], \ + 
CB[sigma[r][0xA]], CB[sigma[r][0xB]], V1, V6, VB, VC); \ + GB(M[sigma[r][0xC]], M[sigma[r][0xD]], \ + CB[sigma[r][0xC]], CB[sigma[r][0xD]], V2, V7, V8, VD); \ + GB(M[sigma[r][0xE]], M[sigma[r][0xF]], \ + CB[sigma[r][0xE]], CB[sigma[r][0xF]], V3, V4, V9, VE); \ + } while (0) + +#else + +#define ROUND_B(r) do { \ + GB(Mx(r, 0), Mx(r, 1), CBx(r, 0), CBx(r, 1), V0, V4, V8, VC); \ + GB(Mx(r, 2), Mx(r, 3), CBx(r, 2), CBx(r, 3), V1, V5, V9, VD); \ + GB(Mx(r, 4), Mx(r, 5), CBx(r, 4), CBx(r, 5), V2, V6, VA, VE); \ + GB(Mx(r, 6), Mx(r, 7), CBx(r, 6), CBx(r, 7), V3, V7, VB, VF); \ + GB(Mx(r, 8), Mx(r, 9), CBx(r, 8), CBx(r, 9), V0, V5, VA, VF); \ + GB(Mx(r, A), Mx(r, B), CBx(r, A), CBx(r, B), V1, V6, VB, VC); \ + GB(Mx(r, C), Mx(r, D), CBx(r, C), CBx(r, D), V2, V7, V8, VD); \ + GB(Mx(r, E), Mx(r, F), CBx(r, E), CBx(r, F), V3, V4, V9, VE); \ + } while (0) + +#endif + +#endif + +#define DECL_STATE32 \ + sph_u32 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u32 S0, S1, S2, S3, T0, T1; + +#define READ_STATE32(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE32(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_32 + +#define COMPRESS32 do { \ + sph_u32 M[16]; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD = T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M[0x0] = sph_dec32be_aligned(buf + 0); \ + M[0x1] = sph_dec32be_aligned(buf + 4); \ + M[0x2] = sph_dec32be_aligned(buf + 8); \ + M[0x3] = sph_dec32be_aligned(buf + 12); \ + M[0x4] = sph_dec32be_aligned(buf + 16); \ + M[0x5] = sph_dec32be_aligned(buf + 20); \ + M[0x6] = sph_dec32be_aligned(buf + 24); \ + M[0x7] = sph_dec32be_aligned(buf + 28); \ + M[0x8] = sph_dec32be_aligned(buf + 32); \ + M[0x9] = sph_dec32be_aligned(buf + 36); \ + M[0xA] = sph_dec32be_aligned(buf + 40); \ + M[0xB] = sph_dec32be_aligned(buf + 44); \ + M[0xC] = sph_dec32be_aligned(buf + 48); \ + M[0xD] = sph_dec32be_aligned(buf + 52); \ + M[0xE] = sph_dec32be_aligned(buf + 56); \ + M[0xF] = sph_dec32be_aligned(buf + 60); \ + for (r = 0; r < 8; r ++) \ + ROUND_S(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS32 do { \ + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u32 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u32 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u32 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CS0; \ + V9 = S1 ^ CS1; \ + VA = S2 ^ CS2; \ + VB = S3 ^ CS3; \ + VC = T0 ^ CS4; \ + VD 
= T0 ^ CS5; \ + VE = T1 ^ CS6; \ + VF = T1 ^ CS7; \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + M8 = sph_dec32be_aligned(buf + 32); \ + M9 = sph_dec32be_aligned(buf + 36); \ + MA = sph_dec32be_aligned(buf + 40); \ + MB = sph_dec32be_aligned(buf + 44); \ + MC = sph_dec32be_aligned(buf + 48); \ + MD = sph_dec32be_aligned(buf + 52); \ + ME = sph_dec32be_aligned(buf + 56); \ + MF = sph_dec32be_aligned(buf + 60); \ + ROUND_S(0); \ + ROUND_S(1); \ + ROUND_S(2); \ + ROUND_S(3); \ + ROUND_S(4); \ + ROUND_S(5); \ + ROUND_S(6); \ + ROUND_S(7); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#endif + +#if SPH_64 + +#define DECL_STATE64 \ + sph_u64 H0, H1, H2, H3, H4, H5, H6, H7; \ + sph_u64 S0, S1, S2, S3, T0, T1; + +#define READ_STATE64(state) do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ + } while (0) + +#define WRITE_STATE64(state) do { \ + (state)->H[0] = H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ + } while (0) + +#if SPH_COMPACT_BLAKE_64 + +#define COMPRESS64 do { \ + sph_u64 M[16]; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + unsigned r; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = S0 ^ CB0; \ + V9 = S1 ^ CB1; \ + VA = S2 ^ CB2; \ + VB = S3 ^ CB3; \ + VC = T0 ^ CB4; \ + VD = T0 ^ CB5; \ + VE = T1 ^ CB6; \ + VF = T1 ^ CB7; \ + M[0x0] = sph_dec64be_aligned(buf + 0); \ + M[0x1] = sph_dec64be_aligned(buf + 8); \ + M[0x2] = sph_dec64be_aligned(buf + 16); \ + M[0x3] = sph_dec64be_aligned(buf + 24); \ + M[0x4] = sph_dec64be_aligned(buf + 32); \ + M[0x5] = sph_dec64be_aligned(buf + 40); \ + M[0x6] = sph_dec64be_aligned(buf + 48); \ + M[0x7] = sph_dec64be_aligned(buf + 56); \ + M[0x8] = sph_dec64be_aligned(buf + 64); \ + M[0x9] = sph_dec64be_aligned(buf + 72); \ + M[0xA] = sph_dec64be_aligned(buf + 80); \ + M[0xB] = sph_dec64be_aligned(buf + 88); \ + M[0xC] = sph_dec64be_aligned(buf + 96); \ + M[0xD] = sph_dec64be_aligned(buf + 104); \ + M[0xE] = sph_dec64be_aligned(buf + 112); \ + M[0xF] = sph_dec64be_aligned(buf + 120); \ + for (r = 0; r < 16; r ++) \ + ROUND_B(r); \ + H0 ^= S0 ^ V0 ^ V8; \ + H1 ^= S1 ^ V1 ^ V9; \ + H2 ^= S2 ^ V2 ^ VA; \ + H3 ^= S3 ^ V3 ^ VB; \ + H4 ^= S0 ^ V4 ^ VC; \ + H5 ^= S1 ^ V5 ^ VD; \ + H6 ^= S2 ^ V6 ^ VE; \ + H7 ^= S3 ^ V7 ^ VF; \ + } while (0) + +#else + +#define COMPRESS64 do { \ + sph_u64 M0, M1, M2, M3, M4, M5, M6, M7; \ + sph_u64 M8, M9, MA, MB, MC, MD, ME, MF; \ + sph_u64 V0, V1, V2, V3, V4, V5, V6, V7; \ + sph_u64 V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + 
V2 = H2; \
+	V3 = H3; \
+	V4 = H4; \
+	V5 = H5; \
+	V6 = H6; \
+	V7 = H7; \
+	V8 = S0 ^ CB0; \
+	V9 = S1 ^ CB1; \
+	VA = S2 ^ CB2; \
+	VB = S3 ^ CB3; \
+	VC = T0 ^ CB4; \
+	VD = T0 ^ CB5; \
+	VE = T1 ^ CB6; \
+	VF = T1 ^ CB7; \
+	M0 = sph_dec64be_aligned(buf +   0); \
+	M1 = sph_dec64be_aligned(buf +   8); \
+	M2 = sph_dec64be_aligned(buf +  16); \
+	M3 = sph_dec64be_aligned(buf +  24); \
+	M4 = sph_dec64be_aligned(buf +  32); \
+	M5 = sph_dec64be_aligned(buf +  40); \
+	M6 = sph_dec64be_aligned(buf +  48); \
+	M7 = sph_dec64be_aligned(buf +  56); \
+	M8 = sph_dec64be_aligned(buf +  64); \
+	M9 = sph_dec64be_aligned(buf +  72); \
+	MA = sph_dec64be_aligned(buf +  80); \
+	MB = sph_dec64be_aligned(buf +  88); \
+	MC = sph_dec64be_aligned(buf +  96); \
+	MD = sph_dec64be_aligned(buf + 104); \
+	ME = sph_dec64be_aligned(buf + 112); \
+	MF = sph_dec64be_aligned(buf + 120); \
+	ROUND_B(0); \
+	ROUND_B(1); \
+	ROUND_B(2); \
+	ROUND_B(3); \
+	ROUND_B(4); \
+	ROUND_B(5); \
+	ROUND_B(6); \
+	ROUND_B(7); \
+	ROUND_B(8); \
+	ROUND_B(9); \
+	/* BLAKE-512 has 16 rounds; the schedule repeats with period 10, \
+	   so rounds 10 to 15 deliberately reuse the round macros 0 to 5 */ \
+	ROUND_B(0); \
+	ROUND_B(1); \
+	ROUND_B(2); \
+	ROUND_B(3); \
+	ROUND_B(4); \
+	ROUND_B(5); \
+	H0 ^= S0 ^ V0 ^ V8; \
+	H1 ^= S1 ^ V1 ^ V9; \
+	H2 ^= S2 ^ V2 ^ VA; \
+	H3 ^= S3 ^ V3 ^ VB; \
+	H4 ^= S0 ^ V4 ^ VC; \
+	H5 ^= S1 ^ V5 ^ VD; \
+	H6 ^= S2 ^ V6 ^ VE; \
+	H7 ^= S3 ^ V7 ^ VF; \
+	} while (0)
+
+#endif
+
+#endif
+
+static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 };
+
+static void
+blake32_init(sph_blake_small_context *sc,
+	const sph_u32 *iv, const sph_u32 *salt)
+{
+	memcpy(sc->H, iv, 8 * sizeof(sph_u32));
+	memcpy(sc->S, salt, 4 * sizeof(sph_u32));
+	sc->T0 = sc->T1 = 0;
+	sc->ptr = 0;
+}
+
+static void
+blake32(sph_blake_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE32
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE32(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			if ((T0 = SPH_T32(T0 + 512)) < 512)
+				T1 = SPH_T32(T1 + 1);
+			COMPRESS32;
+			ptr = 0;
+		}
+	}
+	WRITE_STATE32(sc);
+	sc->ptr = ptr;
+}
+
+static void
+blake32_close(sph_blake_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_size_w32)
+{
+	union {
+		unsigned char buf[64];
+		sph_u32 dummy;
+	} u;
+	size_t ptr, k;
+	unsigned bit_len;
+	unsigned z;
+	sph_u32 th, tl;
+	unsigned char *out;
+
+	ptr = sc->ptr;
+	bit_len = ((unsigned)ptr << 3) + n;
+	z = 0x80 >> n;
+	u.buf[ptr] = ((ub & -z) | z) & 0xFF;
+	tl = sc->T0 + bit_len;
+	th = sc->T1;
+	if (ptr == 0 && n == 0) {
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+	} else if (sc->T0 == 0) {
+		sc->T0 = SPH_C32(0xFFFFFE00) + bit_len;
+		sc->T1 = SPH_T32(sc->T1 - 1);
+	} else {
+		sc->T0 -= 512 - bit_len;
+	}
+	if (bit_len <= 446) {
+		memset(u.buf + ptr + 1, 0, 55 - ptr);
+		if (out_size_w32 == 8)
+			u.buf[55] |= 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+	} else {
+		memset(u.buf + ptr + 1, 0, 63 - ptr);
+		blake32(sc, u.buf + ptr, 64 - ptr);
+		sc->T0 = SPH_C32(0xFFFFFE00);
+		sc->T1 = SPH_C32(0xFFFFFFFF);
+		memset(u.buf, 0, 56);
+		if (out_size_w32 == 8)
+			u.buf[55] = 1;
+		sph_enc32be_aligned(u.buf + 56, th);
+		sph_enc32be_aligned(u.buf + 60, tl);
+		blake32(sc, u.buf, 64);
+	}
+	out = dst;
+	
for (k = 0; k < out_size_w32; k ++) + sph_enc32be(out + (k << 2), sc->H[k]); +} + +#if SPH_64 + +static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; + +static void +blake64_init(sph_blake_big_context *sc, + const sph_u64 *iv, const sph_u64 *salt) +{ + memcpy(sc->H, iv, 8 * sizeof(sph_u64)); + memcpy(sc->S, salt, 4 * sizeof(sph_u64)); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; +} + +static void +blake64(sph_blake_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE64 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE64(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((T0 = SPH_T64(T0 + 1024)) < 1024) + T1 = SPH_T64(T1 + 1); + COMPRESS64; + ptr = 0; + } + } + WRITE_STATE64(sc); + sc->ptr = ptr; +} + +static void +blake64_close(sph_blake_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w64) +{ + union { + unsigned char buf[128]; + sph_u64 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + unsigned z; + sph_u64 th, tl; + unsigned char *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3) + n; + z = 0x80 >> n; + u.buf[ptr] = ((ub & -z) | z) & 0xFF; + tl = sc->T0 + bit_len; + th = sc->T1; + if (ptr == 0 && n == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + } else if (sc->T0 == 0) { + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00) + bit_len; + sc->T1 = SPH_T64(sc->T1 - 1); + } else { + sc->T0 -= 1024 - bit_len; + } + if (bit_len <= 894) { + memset(u.buf + ptr + 1, 0, 111 - ptr); + if (out_size_w64 == 8) + u.buf[111] |= 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf + ptr, 128 - ptr); + } else { + memset(u.buf + ptr + 1, 0, 127 - ptr); + blake64(sc, u.buf + ptr, 128 - ptr); + sc->T0 = SPH_C64(0xFFFFFFFFFFFFFC00); + sc->T1 = SPH_C64(0xFFFFFFFFFFFFFFFF); + memset(u.buf, 0, 112); + if (out_size_w64 == 8) + u.buf[111] = 1; + sph_enc64be_aligned(u.buf + 112, th); + sph_enc64be_aligned(u.buf + 120, tl); + blake64(sc, u.buf, 128); + } + out = dst; + for (k = 0; k < out_size_w64; k ++) + sph_enc64be(out + (k << 3), sc->H[k]); +} + +#endif + +/* see sph_blake.h */ +void +sph_blake224_init(void *cc) +{ + blake32_init(cc, IV224, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake224(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake224_close(void *cc, void *dst) +{ + sph_blake224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 7); + sph_blake224_init(cc); +} + +/* see sph_blake.h */ +void +sph_blake256_init(void *cc) +{ + blake32_init(cc, IV256, salt_zero_small); +} + +/* see sph_blake.h */ +void +sph_blake256(void *cc, const void *data, size_t len) +{ + blake32(cc, data, len); +} + +/* see sph_blake.h */ +void +sph_blake256_close(void *cc, void *dst) +{ + sph_blake256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_blake.h */ +void +sph_blake256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + blake32_close(cc, ub, n, dst, 8); + sph_blake256_init(cc); +} + +#if SPH_64 + +/* see sph_blake.h */ +void 
+sph_blake384_init(void *cc)
+{
+	blake64_init(cc, IV384, salt_zero_big);
+}
+
+/* see sph_blake.h */
+void
+sph_blake384(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h */
+void
+sph_blake384_close(void *cc, void *dst)
+{
+	sph_blake384_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h */
+void
+sph_blake384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 6);
+	sph_blake384_init(cc);
+}
+
+/* see sph_blake.h */
+void
+sph_blake512_init(void *cc)
+{
+	blake64_init(cc, IV512, salt_zero_big);
+}
+
+/* see sph_blake.h */
+void
+sph_blake512(void *cc, const void *data, size_t len)
+{
+	blake64(cc, data, len);
+}
+
+/* see sph_blake.h */
+void
+sph_blake512_close(void *cc, void *dst)
+{
+	sph_blake512_addbits_and_close(cc, 0, 0, dst);
+}
+
+/* see sph_blake.h */
+void
+sph_blake512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	blake64_close(cc, ub, n, dst, 8);
+	sph_blake512_init(cc);
+}
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/sph_blake.h b/sha3/sph_blake.h
new file mode 100644
index 00000000..d8d79439
--- /dev/null
+++ b/sha3/sph_blake.h
@@ -0,0 +1,327 @@
+/* $Id: sph_blake.h 252 2011-06-07 17:55:14Z tp $ */
+/**
+ * BLAKE interface. BLAKE is a family of functions which differ by their
+ * output size; this implementation defines BLAKE for output sizes 224,
+ * 256, 384 and 512 bits. This implementation conforms to the "third
+ * round" specification.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_blake.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_BLAKE_H__
+#define SPH_BLAKE_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for BLAKE-224.
+ */
+#define SPH_SIZE_blake224   224
+
+/**
+ * Output size (in bits) for BLAKE-256.
+ */
+#define SPH_SIZE_blake256   256
+
+#if SPH_64
+
+/**
+ * Output size (in bits) for BLAKE-384.
+ */
+#define SPH_SIZE_blake384   384
+
+/**
+ * Output size (in bits) for BLAKE-512.
+ */
+#define SPH_SIZE_blake512   512
+
+#endif
+
+/**
+ * This structure is a context for BLAKE-224 and BLAKE-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;
+	sph_u32 H[8];
+	sph_u32 S[4];
+	sph_u32 T0, T1;
+#endif
+} sph_blake_small_context;
+
+/**
+ * This structure is a context for BLAKE-224 computations. It is
+ * identical to the common sph_blake_small_context.
+ */
+typedef sph_blake_small_context sph_blake224_context;
+
+/**
+ * This structure is a context for BLAKE-256 computations. It is
+ * identical to the common sph_blake_small_context.
+ */
+typedef sph_blake_small_context sph_blake256_context;
+
+#if SPH_64
+
+/**
+ * This structure is a context for BLAKE-384 and BLAKE-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a BLAKE computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running BLAKE
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;
+	sph_u64 H[8];
+	sph_u64 S[4];
+	sph_u64 T0, T1;
+#endif
+} sph_blake_big_context;
+
+/**
+ * This structure is a context for BLAKE-384 computations. It is
+ * identical to the common sph_blake_big_context.
+ */
+typedef sph_blake_big_context sph_blake384_context;
+
+/**
+ * This structure is a context for BLAKE-512 computations. It is
+ * identical to the common sph_blake_big_context.
+ */
+typedef sph_blake_big_context sph_blake512_context;
+
+#endif
+
+/**
+ * Initialize a BLAKE-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-224 context (pointer to a
+ *             sph_blake224_context)
+ */
+void sph_blake224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param dst   the destination buffer
+ */
+void sph_blake224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
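+/*
+ * Illustrative example of the ub/n convention used by the
+ * addbits_and_close() functions (derived from the description above,
+ * not part of the original declarations): to append the three extra
+ * bits 1, 0, 1, in that order, after the last full byte, pass n = 3
+ * and ub = 0xA0, so that the extra bits occupy bit positions 7, 6
+ * and 5 of ub.
+ */
+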
+/**
+ * Initialize a BLAKE-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-256 context (pointer to a
+ *             sph_blake256_context)
+ */
+void sph_blake256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param dst   the destination buffer
+ */
+void sph_blake256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#if SPH_64
+
+/**
+ * Initialize a BLAKE-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-384 context (pointer to a
+ *             sph_blake384_context)
+ */
+void sph_blake384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param dst   the destination buffer
+ */
+void sph_blake384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a BLAKE-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the BLAKE-512 context (pointer to a
+ *             sph_blake512_context)
+ */
+void sph_blake512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the BLAKE-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_blake512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current BLAKE-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param dst   the destination buffer
+ */
+void sph_blake512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the BLAKE-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_blake512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
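The declarations above form a conventional streaming-hash interface: one init call, any number of update calls, then a close that writes the digest and re-initializes the context. A minimal sketch of a caller, assuming only the sph_blake.h header introduced by this patch is on the include path (this example program is illustrative and not part of the patch):

#include <stdio.h>
#include <string.h>

#include "sph_blake.h"

int main(void)
{
	sph_blake256_context cc;
	unsigned char digest[SPH_SIZE_blake256 / 8];
	const char *msg = "hello world";
	size_t k;

	sph_blake256_init(&cc);
	/* input may be split across any number of update calls */
	sph_blake256(&cc, msg, 5);
	sph_blake256(&cc, msg + 5, strlen(msg) - 5);
	/* writes the 32-byte digest and re-initializes cc */
	sph_blake256_close(&cc, digest);

	for (k = 0; k < sizeof digest; k++)
		printf("%02x", digest[k]);
	printf("\n");
	return 0;
}

The same pattern applies to the 224-, 384- and 512-bit variants, with the digest buffer sized from the corresponding SPH_SIZE_blake* constant.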
diff --git a/sha3/sph_bmw.c b/sha3/sph_bmw.c
new file mode 100644
index 00000000..b89a881e
--- /dev/null
+++ b/sha3/sph_bmw.c
@@ -0,0 +1,965 @@
+/* $Id: bmw.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * BMW implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include "sph_bmw.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_BMW
+#define SPH_SMALL_FOOTPRINT_BMW   1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[] = {
+	SPH_C32(0x00010203), SPH_C32(0x04050607),
+	SPH_C32(0x08090A0B), SPH_C32(0x0C0D0E0F),
+	SPH_C32(0x10111213), SPH_C32(0x14151617),
+	SPH_C32(0x18191A1B), SPH_C32(0x1C1D1E1F),
+	SPH_C32(0x20212223), SPH_C32(0x24252627),
+	SPH_C32(0x28292A2B), SPH_C32(0x2C2D2E2F),
+	SPH_C32(0x30313233), SPH_C32(0x34353637),
+	SPH_C32(0x38393A3B), SPH_C32(0x3C3D3E3F)
+};
+
+static const sph_u32 IV256[] = {
+	SPH_C32(0x40414243), SPH_C32(0x44454647),
+	SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F),
+	SPH_C32(0x50515253), SPH_C32(0x54555657),
+	SPH_C32(0x58595A5B), SPH_C32(0x5C5D5E5F),
+	SPH_C32(0x60616263), SPH_C32(0x64656667),
+	SPH_C32(0x68696A6B), SPH_C32(0x6C6D6E6F),
+	SPH_C32(0x70717273), SPH_C32(0x74757677),
+	SPH_C32(0x78797A7B), SPH_C32(0x7C7D7E7F)
+};
+
+#if SPH_64
+
+static const sph_u64 IV384[] = {
+	SPH_C64(0x0001020304050607), SPH_C64(0x08090A0B0C0D0E0F),
+	SPH_C64(0x1011121314151617), SPH_C64(0x18191A1B1C1D1E1F),
+	SPH_C64(0x2021222324252627), SPH_C64(0x28292A2B2C2D2E2F),
+	SPH_C64(0x3031323334353637), SPH_C64(0x38393A3B3C3D3E3F),
+	SPH_C64(0x4041424344454647), SPH_C64(0x48494A4B4C4D4E4F),
+	SPH_C64(0x5051525354555657), SPH_C64(0x58595A5B5C5D5E5F),
+	SPH_C64(0x6061626364656667), SPH_C64(0x68696A6B6C6D6E6F),
+	SPH_C64(0x7071727374757677), SPH_C64(0x78797A7B7C7D7E7F)
+};
+
+static const sph_u64 IV512[] = {
+	SPH_C64(0x8081828384858687), SPH_C64(0x88898A8B8C8D8E8F),
+	SPH_C64(0x9091929394959697), SPH_C64(0x98999A9B9C9D9E9F),
+	SPH_C64(0xA0A1A2A3A4A5A6A7), SPH_C64(0xA8A9AAABACADAEAF),
+	SPH_C64(0xB0B1B2B3B4B5B6B7), SPH_C64(0xB8B9BABBBCBDBEBF),
+	SPH_C64(0xC0C1C2C3C4C5C6C7), SPH_C64(0xC8C9CACBCCCDCECF),
+	SPH_C64(0xD0D1D2D3D4D5D6D7), SPH_C64(0xD8D9DADBDCDDDEDF),
+	SPH_C64(0xE0E1E2E3E4E5E6E7), SPH_C64(0xE8E9EAEBECEDEEEF),
+	SPH_C64(0xF0F1F2F3F4F5F6F7), SPH_C64(0xF8F9FAFBFCFDFEFF)
+};
+
+#endif
+
+#define XCAT(x, y)     XCAT_(x, y)
+#define XCAT_(x, y)    x ## y
+
+#define LPAR   (
+
+#define I16_16    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
+#define I16_17    1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
+#define I16_18    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17
+#define I16_19    3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18
+#define I16_20    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
+#define I16_21    5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
+#define I16_22    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+#define I16_23    7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
+#define I16_24    8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+#define I16_25    9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+#define I16_26   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25
+#define I16_27   11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26
+#define I16_28   12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+#define I16_29   13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28
+#define I16_30   14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+#define I16_31   15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+
+#define M16_16    0,  1,  3,  4,  7, 10, 11
+#define M16_17 1, 2, 4, 5, 8, 11, 12 +#define M16_18 2, 3, 5, 6, 9, 12, 13 +#define M16_19 3, 4, 6, 7, 10, 13, 14 +#define M16_20 4, 5, 7, 8, 11, 14, 15 +#define M16_21 5, 6, 8, 9, 12, 15, 16 +#define M16_22 6, 7, 9, 10, 13, 0, 1 +#define M16_23 7, 8, 10, 11, 14, 1, 2 +#define M16_24 8, 9, 11, 12, 15, 2, 3 +#define M16_25 9, 10, 12, 13, 0, 3, 4 +#define M16_26 10, 11, 13, 14, 1, 4, 5 +#define M16_27 11, 12, 14, 15, 2, 5, 6 +#define M16_28 12, 13, 15, 16, 3, 6, 7 +#define M16_29 13, 14, 0, 1, 4, 7, 8 +#define M16_30 14, 15, 1, 2, 5, 8, 9 +#define M16_31 15, 16, 2, 3, 6, 9, 10 + +#define ss0(x) (((x) >> 1) ^ SPH_T32((x) << 3) \ + ^ SPH_ROTL32(x, 4) ^ SPH_ROTL32(x, 19)) +#define ss1(x) (((x) >> 1) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 8) ^ SPH_ROTL32(x, 23)) +#define ss2(x) (((x) >> 2) ^ SPH_T32((x) << 1) \ + ^ SPH_ROTL32(x, 12) ^ SPH_ROTL32(x, 25)) +#define ss3(x) (((x) >> 2) ^ SPH_T32((x) << 2) \ + ^ SPH_ROTL32(x, 15) ^ SPH_ROTL32(x, 29)) +#define ss4(x) (((x) >> 1) ^ (x)) +#define ss5(x) (((x) >> 2) ^ (x)) +#define rs1(x) SPH_ROTL32(x, 3) +#define rs2(x) SPH_ROTL32(x, 7) +#define rs3(x) SPH_ROTL32(x, 13) +#define rs4(x) SPH_ROTL32(x, 16) +#define rs5(x) SPH_ROTL32(x, 19) +#define rs6(x) SPH_ROTL32(x, 23) +#define rs7(x) SPH_ROTL32(x, 27) + +#define Ks(j) SPH_T32((sph_u32)(j) * SPH_C32(0x05555555)) + +#define add_elt_s(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T32(SPH_ROTL32(mf(j0m), j1m) + SPH_ROTL32(mf(j3m), j4m) \ + - SPH_ROTL32(mf(j10m), j11m) + Ks(j16)) ^ hf(j7m)) + +#define expand1s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(ss1(qf(i0)) + ss2(qf(i1)) + ss3(qf(i2)) + ss0(qf(i3)) \ + + ss1(qf(i4)) + ss2(qf(i5)) + ss3(qf(i6)) + ss0(qf(i7)) \ + + ss1(qf(i8)) + ss2(qf(i9)) + ss3(qf(i10)) + ss0(qf(i11)) \ + + ss1(qf(i12)) + ss2(qf(i13)) + ss3(qf(i14)) + ss0(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1s(qf, mf, hf, i16) \ + expand1s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1s_(qf, mf, hf, i16, ix, iy) \ + expand1s_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2s_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T32(qf(i0) + rs1(qf(i1)) + qf(i2) + rs2(qf(i3)) \ + + qf(i4) + rs3(qf(i5)) + qf(i6) + rs4(qf(i7)) \ + + qf(i8) + rs5(qf(i9)) + qf(i10) + rs6(qf(i11)) \ + + qf(i12) + rs7(qf(i13)) + ss4(qf(i14)) + ss5(qf(i15)) \ + + add_elt_s(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2s(qf, mf, hf, i16) \ + expand2s_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2s_(qf, mf, hf, i16, ix, iy) \ + expand2s_inner LPAR qf, mf, hf, i16, ix, iy) + +#if SPH_64 + +#define sb0(x) (((x) >> 1) ^ SPH_T64((x) << 3) \ + ^ SPH_ROTL64(x, 4) ^ SPH_ROTL64(x, 37)) +#define sb1(x) (((x) >> 1) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 13) ^ SPH_ROTL64(x, 43)) +#define sb2(x) (((x) >> 2) ^ SPH_T64((x) << 1) \ + ^ SPH_ROTL64(x, 19) ^ SPH_ROTL64(x, 53)) +#define sb3(x) (((x) >> 2) ^ SPH_T64((x) << 2) \ + ^ SPH_ROTL64(x, 28) ^ SPH_ROTL64(x, 59)) +#define sb4(x) (((x) >> 1) ^ (x)) +#define sb5(x) (((x) >> 2) ^ (x)) +#define rb1(x) SPH_ROTL64(x, 5) +#define rb2(x) SPH_ROTL64(x, 11) +#define rb3(x) SPH_ROTL64(x, 27) +#define rb4(x) SPH_ROTL64(x, 32) +#define rb5(x) SPH_ROTL64(x, 37) +#define rb6(x) SPH_ROTL64(x, 43) +#define rb7(x) SPH_ROTL64(x, 53) + +#define Kb(j) SPH_T64((sph_u64)(j) * 
SPH_C64(0x0555555555555555)) + +#if SPH_SMALL_FOOTPRINT_BMW + +static const sph_u64 Kb_tab[] = { + Kb(16), Kb(17), Kb(18), Kb(19), Kb(20), Kb(21), Kb(22), Kb(23), + Kb(24), Kb(25), Kb(26), Kb(27), Kb(28), Kb(29), Kb(30), Kb(31) +}; + +#define rol_off(mf, j, off) \ + SPH_ROTL64(mf(((j) + (off)) & 15), (((j) + (off)) & 15) + 1) + +#define add_elt_b(mf, hf, j) \ + (SPH_T64(rol_off(mf, j, 0) + rol_off(mf, j, 3) \ + - rol_off(mf, j, 10) + Kb_tab[j]) ^ hf(((j) + 7) & 15)) + +#define expand1b(qf, mf, hf, i) \ + SPH_T64(sb1(qf((i) - 16)) + sb2(qf((i) - 15)) \ + + sb3(qf((i) - 14)) + sb0(qf((i) - 13)) \ + + sb1(qf((i) - 12)) + sb2(qf((i) - 11)) \ + + sb3(qf((i) - 10)) + sb0(qf((i) - 9)) \ + + sb1(qf((i) - 8)) + sb2(qf((i) - 7)) \ + + sb3(qf((i) - 6)) + sb0(qf((i) - 5)) \ + + sb1(qf((i) - 4)) + sb2(qf((i) - 3)) \ + + sb3(qf((i) - 2)) + sb0(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#define expand2b(qf, mf, hf, i) \ + SPH_T64(qf((i) - 16) + rb1(qf((i) - 15)) \ + + qf((i) - 14) + rb2(qf((i) - 13)) \ + + qf((i) - 12) + rb3(qf((i) - 11)) \ + + qf((i) - 10) + rb4(qf((i) - 9)) \ + + qf((i) - 8) + rb5(qf((i) - 7)) \ + + qf((i) - 6) + rb6(qf((i) - 5)) \ + + qf((i) - 4) + rb7(qf((i) - 3)) \ + + sb4(qf((i) - 2)) + sb5(qf((i) - 1)) \ + + add_elt_b(mf, hf, (i) - 16)) + +#else + +#define add_elt_b(mf, hf, j0m, j1m, j3m, j4m, j7m, j10m, j11m, j16) \ + (SPH_T64(SPH_ROTL64(mf(j0m), j1m) + SPH_ROTL64(mf(j3m), j4m) \ + - SPH_ROTL64(mf(j10m), j11m) + Kb(j16)) ^ hf(j7m)) + +#define expand1b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(sb1(qf(i0)) + sb2(qf(i1)) + sb3(qf(i2)) + sb0(qf(i3)) \ + + sb1(qf(i4)) + sb2(qf(i5)) + sb3(qf(i6)) + sb0(qf(i7)) \ + + sb1(qf(i8)) + sb2(qf(i9)) + sb3(qf(i10)) + sb0(qf(i11)) \ + + sb1(qf(i12)) + sb2(qf(i13)) + sb3(qf(i14)) + sb0(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand1b(qf, mf, hf, i16) \ + expand1b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand1b_(qf, mf, hf, i16, ix, iy) \ + expand1b_inner LPAR qf, mf, hf, i16, ix, iy) + +#define expand2b_inner(qf, mf, hf, i16, \ + i0, i1, i2, i3, i4, i5, i6, i7, i8, \ + i9, i10, i11, i12, i13, i14, i15, \ + i0m, i1m, i3m, i4m, i7m, i10m, i11m) \ + SPH_T64(qf(i0) + rb1(qf(i1)) + qf(i2) + rb2(qf(i3)) \ + + qf(i4) + rb3(qf(i5)) + qf(i6) + rb4(qf(i7)) \ + + qf(i8) + rb5(qf(i9)) + qf(i10) + rb6(qf(i11)) \ + + qf(i12) + rb7(qf(i13)) + sb4(qf(i14)) + sb5(qf(i15)) \ + + add_elt_b(mf, hf, i0m, i1m, i3m, i4m, i7m, i10m, i11m, i16)) + +#define expand2b(qf, mf, hf, i16) \ + expand2b_(qf, mf, hf, i16, I16_ ## i16, M16_ ## i16) +#define expand2b_(qf, mf, hf, i16, ix, iy) \ + expand2b_inner LPAR qf, mf, hf, i16, ix, iy) + +#endif + +#endif + +#define MAKE_W(tt, i0, op01, i1, op12, i2, op23, i3, op34, i4) \ + tt((M(i0) ^ H(i0)) op01 (M(i1) ^ H(i1)) op12 (M(i2) ^ H(i2)) \ + op23 (M(i3) ^ H(i3)) op34 (M(i4) ^ H(i4))) + +#define Ws0 MAKE_W(SPH_T32, 5, -, 7, +, 10, +, 13, +, 14) +#define Ws1 MAKE_W(SPH_T32, 6, -, 8, +, 11, +, 14, -, 15) +#define Ws2 MAKE_W(SPH_T32, 0, +, 7, +, 9, -, 12, +, 15) +#define Ws3 MAKE_W(SPH_T32, 0, -, 1, +, 8, -, 10, +, 13) +#define Ws4 MAKE_W(SPH_T32, 1, +, 2, +, 9, -, 11, -, 14) +#define Ws5 MAKE_W(SPH_T32, 3, -, 2, +, 10, -, 12, +, 15) +#define Ws6 MAKE_W(SPH_T32, 4, -, 0, -, 3, -, 11, +, 13) +#define Ws7 MAKE_W(SPH_T32, 1, -, 4, -, 5, -, 12, -, 14) +#define Ws8 MAKE_W(SPH_T32, 2, -, 5, -, 6, +, 13, -, 15) +#define Ws9 MAKE_W(SPH_T32, 0, -, 3, +, 6, -, 7, +, 
14) +#define Ws10 MAKE_W(SPH_T32, 8, -, 1, -, 4, -, 7, +, 15) +#define Ws11 MAKE_W(SPH_T32, 8, -, 0, -, 2, -, 5, +, 9) +#define Ws12 MAKE_W(SPH_T32, 1, +, 3, -, 6, -, 9, +, 10) +#define Ws13 MAKE_W(SPH_T32, 2, +, 4, +, 7, +, 10, +, 11) +#define Ws14 MAKE_W(SPH_T32, 3, -, 5, +, 8, -, 11, -, 12) +#define Ws15 MAKE_W(SPH_T32, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qas do { \ + unsigned u; \ + sph_u32 Ws[16]; \ + Ws[ 0] = Ws0; \ + Ws[ 1] = Ws1; \ + Ws[ 2] = Ws2; \ + Ws[ 3] = Ws3; \ + Ws[ 4] = Ws4; \ + Ws[ 5] = Ws5; \ + Ws[ 6] = Ws6; \ + Ws[ 7] = Ws7; \ + Ws[ 8] = Ws8; \ + Ws[ 9] = Ws9; \ + Ws[10] = Ws10; \ + Ws[11] = Ws11; \ + Ws[12] = Ws12; \ + Ws[13] = Ws13; \ + Ws[14] = Ws14; \ + Ws[15] = Ws15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T32(ss0(Ws[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T32(ss1(Ws[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T32(ss2(Ws[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T32(ss3(Ws[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T32(ss4(Ws[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T32(ss0(Ws[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#else + +#define MAKE_Qas do { \ + qt[ 0] = SPH_T32(ss0(Ws0 ) + H( 1)); \ + qt[ 1] = SPH_T32(ss1(Ws1 ) + H( 2)); \ + qt[ 2] = SPH_T32(ss2(Ws2 ) + H( 3)); \ + qt[ 3] = SPH_T32(ss3(Ws3 ) + H( 4)); \ + qt[ 4] = SPH_T32(ss4(Ws4 ) + H( 5)); \ + qt[ 5] = SPH_T32(ss0(Ws5 ) + H( 6)); \ + qt[ 6] = SPH_T32(ss1(Ws6 ) + H( 7)); \ + qt[ 7] = SPH_T32(ss2(Ws7 ) + H( 8)); \ + qt[ 8] = SPH_T32(ss3(Ws8 ) + H( 9)); \ + qt[ 9] = SPH_T32(ss4(Ws9 ) + H(10)); \ + qt[10] = SPH_T32(ss0(Ws10) + H(11)); \ + qt[11] = SPH_T32(ss1(Ws11) + H(12)); \ + qt[12] = SPH_T32(ss2(Ws12) + H(13)); \ + qt[13] = SPH_T32(ss3(Ws13) + H(14)); \ + qt[14] = SPH_T32(ss4(Ws14) + H(15)); \ + qt[15] = SPH_T32(ss0(Ws15) + H( 0)); \ + } while (0) + +#define MAKE_Qbs do { \ + qt[16] = expand1s(Qs, M, H, 16); \ + qt[17] = expand1s(Qs, M, H, 17); \ + qt[18] = expand2s(Qs, M, H, 18); \ + qt[19] = expand2s(Qs, M, H, 19); \ + qt[20] = expand2s(Qs, M, H, 20); \ + qt[21] = expand2s(Qs, M, H, 21); \ + qt[22] = expand2s(Qs, M, H, 22); \ + qt[23] = expand2s(Qs, M, H, 23); \ + qt[24] = expand2s(Qs, M, H, 24); \ + qt[25] = expand2s(Qs, M, H, 25); \ + qt[26] = expand2s(Qs, M, H, 26); \ + qt[27] = expand2s(Qs, M, H, 27); \ + qt[28] = expand2s(Qs, M, H, 28); \ + qt[29] = expand2s(Qs, M, H, 29); \ + qt[30] = expand2s(Qs, M, H, 30); \ + qt[31] = expand2s(Qs, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qs do { \ + MAKE_Qas; \ + MAKE_Qbs; \ + } while (0) + +#define Qs(j) (qt[j]) + +#if SPH_64 + +#define Wb0 MAKE_W(SPH_T64, 5, -, 7, +, 10, +, 13, +, 14) +#define Wb1 MAKE_W(SPH_T64, 6, -, 8, +, 11, +, 14, -, 15) +#define Wb2 MAKE_W(SPH_T64, 0, +, 7, +, 9, -, 12, +, 15) +#define Wb3 MAKE_W(SPH_T64, 0, -, 1, +, 8, -, 10, +, 13) +#define Wb4 MAKE_W(SPH_T64, 1, +, 2, +, 9, -, 11, -, 14) +#define Wb5 MAKE_W(SPH_T64, 3, -, 2, +, 10, -, 12, +, 15) +#define Wb6 
MAKE_W(SPH_T64, 4, -, 0, -, 3, -, 11, +, 13) +#define Wb7 MAKE_W(SPH_T64, 1, -, 4, -, 5, -, 12, -, 14) +#define Wb8 MAKE_W(SPH_T64, 2, -, 5, -, 6, +, 13, -, 15) +#define Wb9 MAKE_W(SPH_T64, 0, -, 3, +, 6, -, 7, +, 14) +#define Wb10 MAKE_W(SPH_T64, 8, -, 1, -, 4, -, 7, +, 15) +#define Wb11 MAKE_W(SPH_T64, 8, -, 0, -, 2, -, 5, +, 9) +#define Wb12 MAKE_W(SPH_T64, 1, +, 3, -, 6, -, 9, +, 10) +#define Wb13 MAKE_W(SPH_T64, 2, +, 4, +, 7, +, 10, +, 11) +#define Wb14 MAKE_W(SPH_T64, 3, -, 5, +, 8, -, 11, -, 12) +#define Wb15 MAKE_W(SPH_T64, 12, -, 4, -, 6, -, 9, +, 13) + +#if SPH_SMALL_FOOTPRINT_BMW + +#define MAKE_Qab do { \ + unsigned u; \ + sph_u64 Wb[16]; \ + Wb[ 0] = Wb0; \ + Wb[ 1] = Wb1; \ + Wb[ 2] = Wb2; \ + Wb[ 3] = Wb3; \ + Wb[ 4] = Wb4; \ + Wb[ 5] = Wb5; \ + Wb[ 6] = Wb6; \ + Wb[ 7] = Wb7; \ + Wb[ 8] = Wb8; \ + Wb[ 9] = Wb9; \ + Wb[10] = Wb10; \ + Wb[11] = Wb11; \ + Wb[12] = Wb12; \ + Wb[13] = Wb13; \ + Wb[14] = Wb14; \ + Wb[15] = Wb15; \ + for (u = 0; u < 15; u += 5) { \ + qt[u + 0] = SPH_T64(sb0(Wb[u + 0]) + H(u + 1)); \ + qt[u + 1] = SPH_T64(sb1(Wb[u + 1]) + H(u + 2)); \ + qt[u + 2] = SPH_T64(sb2(Wb[u + 2]) + H(u + 3)); \ + qt[u + 3] = SPH_T64(sb3(Wb[u + 3]) + H(u + 4)); \ + qt[u + 4] = SPH_T64(sb4(Wb[u + 4]) + H(u + 5)); \ + } \ + qt[15] = SPH_T64(sb0(Wb[15]) + H(0)); \ + } while (0) + +#define MAKE_Qbb do { \ + unsigned u; \ + for (u = 16; u < 18; u ++) \ + qt[u] = expand1b(Qb, M, H, u); \ + for (u = 18; u < 32; u ++) \ + qt[u] = expand2b(Qb, M, H, u); \ + } while (0) + +#else + +#define MAKE_Qab do { \ + qt[ 0] = SPH_T64(sb0(Wb0 ) + H( 1)); \ + qt[ 1] = SPH_T64(sb1(Wb1 ) + H( 2)); \ + qt[ 2] = SPH_T64(sb2(Wb2 ) + H( 3)); \ + qt[ 3] = SPH_T64(sb3(Wb3 ) + H( 4)); \ + qt[ 4] = SPH_T64(sb4(Wb4 ) + H( 5)); \ + qt[ 5] = SPH_T64(sb0(Wb5 ) + H( 6)); \ + qt[ 6] = SPH_T64(sb1(Wb6 ) + H( 7)); \ + qt[ 7] = SPH_T64(sb2(Wb7 ) + H( 8)); \ + qt[ 8] = SPH_T64(sb3(Wb8 ) + H( 9)); \ + qt[ 9] = SPH_T64(sb4(Wb9 ) + H(10)); \ + qt[10] = SPH_T64(sb0(Wb10) + H(11)); \ + qt[11] = SPH_T64(sb1(Wb11) + H(12)); \ + qt[12] = SPH_T64(sb2(Wb12) + H(13)); \ + qt[13] = SPH_T64(sb3(Wb13) + H(14)); \ + qt[14] = SPH_T64(sb4(Wb14) + H(15)); \ + qt[15] = SPH_T64(sb0(Wb15) + H( 0)); \ + } while (0) + +#define MAKE_Qbb do { \ + qt[16] = expand1b(Qb, M, H, 16); \ + qt[17] = expand1b(Qb, M, H, 17); \ + qt[18] = expand2b(Qb, M, H, 18); \ + qt[19] = expand2b(Qb, M, H, 19); \ + qt[20] = expand2b(Qb, M, H, 20); \ + qt[21] = expand2b(Qb, M, H, 21); \ + qt[22] = expand2b(Qb, M, H, 22); \ + qt[23] = expand2b(Qb, M, H, 23); \ + qt[24] = expand2b(Qb, M, H, 24); \ + qt[25] = expand2b(Qb, M, H, 25); \ + qt[26] = expand2b(Qb, M, H, 26); \ + qt[27] = expand2b(Qb, M, H, 27); \ + qt[28] = expand2b(Qb, M, H, 28); \ + qt[29] = expand2b(Qb, M, H, 29); \ + qt[30] = expand2b(Qb, M, H, 30); \ + qt[31] = expand2b(Qb, M, H, 31); \ + } while (0) + +#endif + +#define MAKE_Qb do { \ + MAKE_Qab; \ + MAKE_Qbb; \ + } while (0) + +#define Qb(j) (qt[j]) + +#endif + +#define FOLD(type, mkQ, tt, rol, mf, qf, dhf) do { \ + type qt[32], xl, xh; \ + mkQ; \ + xl = qf(16) ^ qf(17) ^ qf(18) ^ qf(19) \ + ^ qf(20) ^ qf(21) ^ qf(22) ^ qf(23); \ + xh = xl ^ qf(24) ^ qf(25) ^ qf(26) ^ qf(27) \ + ^ qf(28) ^ qf(29) ^ qf(30) ^ qf(31); \ + dhf( 0) = tt(((xh << 5) ^ (qf(16) >> 5) ^ mf( 0)) \ + + (xl ^ qf(24) ^ qf( 0))); \ + dhf( 1) = tt(((xh >> 7) ^ (qf(17) << 8) ^ mf( 1)) \ + + (xl ^ qf(25) ^ qf( 1))); \ + dhf( 2) = tt(((xh >> 5) ^ (qf(18) << 5) ^ mf( 2)) \ + + (xl ^ qf(26) ^ qf( 2))); \ + dhf( 3) = tt(((xh >> 1) ^ (qf(19) << 5) ^ mf( 3)) \ + + (xl ^ qf(27) ^ qf( 3))); 
\ + dhf( 4) = tt(((xh >> 3) ^ (qf(20) << 0) ^ mf( 4)) \ + + (xl ^ qf(28) ^ qf( 4))); \ + dhf( 5) = tt(((xh << 6) ^ (qf(21) >> 6) ^ mf( 5)) \ + + (xl ^ qf(29) ^ qf( 5))); \ + dhf( 6) = tt(((xh >> 4) ^ (qf(22) << 6) ^ mf( 6)) \ + + (xl ^ qf(30) ^ qf( 6))); \ + dhf( 7) = tt(((xh >> 11) ^ (qf(23) << 2) ^ mf( 7)) \ + + (xl ^ qf(31) ^ qf( 7))); \ + dhf( 8) = tt(rol(dhf(4), 9) + (xh ^ qf(24) ^ mf( 8)) \ + + ((xl << 8) ^ qf(23) ^ qf( 8))); \ + dhf( 9) = tt(rol(dhf(5), 10) + (xh ^ qf(25) ^ mf( 9)) \ + + ((xl >> 6) ^ qf(16) ^ qf( 9))); \ + dhf(10) = tt(rol(dhf(6), 11) + (xh ^ qf(26) ^ mf(10)) \ + + ((xl << 6) ^ qf(17) ^ qf(10))); \ + dhf(11) = tt(rol(dhf(7), 12) + (xh ^ qf(27) ^ mf(11)) \ + + ((xl << 4) ^ qf(18) ^ qf(11))); \ + dhf(12) = tt(rol(dhf(0), 13) + (xh ^ qf(28) ^ mf(12)) \ + + ((xl >> 3) ^ qf(19) ^ qf(12))); \ + dhf(13) = tt(rol(dhf(1), 14) + (xh ^ qf(29) ^ mf(13)) \ + + ((xl >> 4) ^ qf(20) ^ qf(13))); \ + dhf(14) = tt(rol(dhf(2), 15) + (xh ^ qf(30) ^ mf(14)) \ + + ((xl >> 7) ^ qf(21) ^ qf(14))); \ + dhf(15) = tt(rol(dhf(3), 16) + (xh ^ qf(31) ^ mf(15)) \ + + ((xl >> 2) ^ qf(22) ^ qf(15))); \ + } while (0) + +#define FOLDs FOLD(sph_u32, MAKE_Qs, SPH_T32, SPH_ROTL32, M, Qs, dH) + +#if SPH_64 + +#define FOLDb FOLD(sph_u64, MAKE_Qb, SPH_T64, SPH_ROTL64, M, Qb, dH) + +#endif + +static void +compress_small(const unsigned char *data, const sph_u32 h[16], sph_u32 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec32le_aligned(data + 4 * (x)) +#else + sph_u32 mv[16]; + + mv[ 0] = sph_dec32le_aligned(data + 0); + mv[ 1] = sph_dec32le_aligned(data + 4); + mv[ 2] = sph_dec32le_aligned(data + 8); + mv[ 3] = sph_dec32le_aligned(data + 12); + mv[ 4] = sph_dec32le_aligned(data + 16); + mv[ 5] = sph_dec32le_aligned(data + 20); + mv[ 6] = sph_dec32le_aligned(data + 24); + mv[ 7] = sph_dec32le_aligned(data + 28); + mv[ 8] = sph_dec32le_aligned(data + 32); + mv[ 9] = sph_dec32le_aligned(data + 36); + mv[10] = sph_dec32le_aligned(data + 40); + mv[11] = sph_dec32le_aligned(data + 44); + mv[12] = sph_dec32le_aligned(data + 48); + mv[13] = sph_dec32le_aligned(data + 52); + mv[14] = sph_dec32le_aligned(data + 56); + mv[15] = sph_dec32le_aligned(data + 60); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDs; + +#undef M +#undef H +#undef dH +} + +static const sph_u32 final_s[16] = { + SPH_C32(0xaaaaaaa0), SPH_C32(0xaaaaaaa1), SPH_C32(0xaaaaaaa2), + SPH_C32(0xaaaaaaa3), SPH_C32(0xaaaaaaa4), SPH_C32(0xaaaaaaa5), + SPH_C32(0xaaaaaaa6), SPH_C32(0xaaaaaaa7), SPH_C32(0xaaaaaaa8), + SPH_C32(0xaaaaaaa9), SPH_C32(0xaaaaaaaa), SPH_C32(0xaaaaaaab), + SPH_C32(0xaaaaaaac), SPH_C32(0xaaaaaaad), SPH_C32(0xaaaaaaae), + SPH_C32(0xaaaaaaaf) +}; + +static void +bmw32_init(sph_bmw_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +static void +bmw32(sph_bmw_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u32 htmp[16]; + sph_u32 *h1, *h2; +#if !SPH_64 + sph_u32 tmp; +#endif + +#if SPH_64 + sc->bit_count += (sph_u64)len << 3; +#else + tmp = sc->bit_count_low; + sc->bit_count_low = SPH_T32(tmp + ((sph_u32)len << 3)); + if (sc->bit_count_low < tmp) + sc->bit_count_high ++; + sc->bit_count_high += len >> 29; +#endif + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = 
(const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u32 *ht; + + compress_small(buf, h1, h2); + ht = h1; + h1 = h2; + h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw32_close(sph_bmw_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u32 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_small(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); +#if SPH_64 + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); +#else + sph_enc32le_aligned(buf + (sizeof sc->buf) - 8, + sc->bit_count_low + n); + sph_enc32le_aligned(buf + (sizeof sc->buf) - 4, + SPH_T32(sc->bit_count_high)); +#endif + compress_small(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc32le_aligned(buf + 4 * u, h2[u]); + compress_small(buf, final_s, h1); + out = dst; + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) + sph_enc32le(out + 4 * u, h1[v]); +} + +#if SPH_64 + +static void +compress_big(const unsigned char *data, const sph_u64 h[16], sph_u64 dh[16]) +{ +#if SPH_LITTLE_FAST +#define M(x) sph_dec64le_aligned(data + 8 * (x)) +#else + sph_u64 mv[16]; + + mv[ 0] = sph_dec64le_aligned(data + 0); + mv[ 1] = sph_dec64le_aligned(data + 8); + mv[ 2] = sph_dec64le_aligned(data + 16); + mv[ 3] = sph_dec64le_aligned(data + 24); + mv[ 4] = sph_dec64le_aligned(data + 32); + mv[ 5] = sph_dec64le_aligned(data + 40); + mv[ 6] = sph_dec64le_aligned(data + 48); + mv[ 7] = sph_dec64le_aligned(data + 56); + mv[ 8] = sph_dec64le_aligned(data + 64); + mv[ 9] = sph_dec64le_aligned(data + 72); + mv[10] = sph_dec64le_aligned(data + 80); + mv[11] = sph_dec64le_aligned(data + 88); + mv[12] = sph_dec64le_aligned(data + 96); + mv[13] = sph_dec64le_aligned(data + 104); + mv[14] = sph_dec64le_aligned(data + 112); + mv[15] = sph_dec64le_aligned(data + 120); +#define M(x) (mv[x]) +#endif +#define H(x) (h[x]) +#define dH(x) (dh[x]) + + FOLDb; + +#undef M +#undef H +#undef dH +} + +static const sph_u64 final_b[16] = { + SPH_C64(0xaaaaaaaaaaaaaaa0), SPH_C64(0xaaaaaaaaaaaaaaa1), + SPH_C64(0xaaaaaaaaaaaaaaa2), SPH_C64(0xaaaaaaaaaaaaaaa3), + SPH_C64(0xaaaaaaaaaaaaaaa4), SPH_C64(0xaaaaaaaaaaaaaaa5), + SPH_C64(0xaaaaaaaaaaaaaaa6), SPH_C64(0xaaaaaaaaaaaaaaa7), + SPH_C64(0xaaaaaaaaaaaaaaa8), SPH_C64(0xaaaaaaaaaaaaaaa9), + SPH_C64(0xaaaaaaaaaaaaaaaa), SPH_C64(0xaaaaaaaaaaaaaaab), + SPH_C64(0xaaaaaaaaaaaaaaac), SPH_C64(0xaaaaaaaaaaaaaaad), + SPH_C64(0xaaaaaaaaaaaaaaae), SPH_C64(0xaaaaaaaaaaaaaaaf) +}; + +static void +bmw64_init(sph_bmw_big_context *sc, const sph_u64 *iv) +{ + memcpy(sc->H, iv, sizeof sc->H); + sc->ptr = 0; + sc->bit_count = 0; +} + +static void +bmw64(sph_bmw_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + sph_u64 htmp[16]; + sph_u64 *h1, *h2; + + sc->bit_count += (sph_u64)len << 3; + buf = sc->buf; + ptr = sc->ptr; + h1 = sc->H; + h2 = htmp; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + ptr += clen; + if (ptr == sizeof sc->buf) { + sph_u64 *ht; + + compress_big(buf, h1, h2); + ht = h1; + h1 = h2; + 
h2 = ht; + ptr = 0; + } + } + sc->ptr = ptr; + if (h1 != sc->H) + memcpy(sc->H, h1, sizeof sc->H); +} + +static void +bmw64_close(sph_bmw_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w64) +{ + unsigned char *buf, *out; + size_t ptr, u, v; + unsigned z; + sph_u64 h1[16], h2[16], *h; + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + h = sc->H; + if (ptr > (sizeof sc->buf) - 8) { + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + compress_big(buf, h, h1); + ptr = 0; + h = h1; + } + memset(buf + ptr, 0, (sizeof sc->buf) - 8 - ptr); + sph_enc64le_aligned(buf + (sizeof sc->buf) - 8, + SPH_T64(sc->bit_count + n)); + compress_big(buf, h, h2); + for (u = 0; u < 16; u ++) + sph_enc64le_aligned(buf + 8 * u, h2[u]); + compress_big(buf, final_b, h1); + out = dst; + for (u = 0, v = 16 - out_size_w64; u < out_size_w64; u ++, v ++) + sph_enc64le(out + 8 * u, h1[v]); +} + +#endif + +/* see sph_bmw.h */ +void +sph_bmw224_init(void *cc) +{ + bmw32_init(cc, IV224); +} + +/* see sph_bmw.h */ +void +sph_bmw224(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw224_close(void *cc, void *dst) +{ + sph_bmw224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 7); + sph_bmw224_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw256_init(void *cc) +{ + bmw32_init(cc, IV256); +} + +/* see sph_bmw.h */ +void +sph_bmw256(void *cc, const void *data, size_t len) +{ + bmw32(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw256_close(void *cc, void *dst) +{ + sph_bmw256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw32_close(cc, ub, n, dst, 8); + sph_bmw256_init(cc); +} + +#if SPH_64 + +/* see sph_bmw.h */ +void +sph_bmw384_init(void *cc) +{ + bmw64_init(cc, IV384); +} + +/* see sph_bmw.h */ +void +sph_bmw384(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw384_close(void *cc, void *dst) +{ + sph_bmw384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 6); + sph_bmw384_init(cc); +} + +/* see sph_bmw.h */ +void +sph_bmw512_init(void *cc) +{ + bmw64_init(cc, IV512); +} + +/* see sph_bmw.h */ +void +sph_bmw512(void *cc, const void *data, size_t len) +{ + bmw64(cc, data, len); +} + +/* see sph_bmw.h */ +void +sph_bmw512_close(void *cc, void *dst) +{ + sph_bmw512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_bmw.h */ +void +sph_bmw512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + bmw64_close(cc, ub, n, dst, 8); + sph_bmw512_init(cc); +} + +#endif + +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_bmw.h b/sha3/sph_bmw.h new file mode 100644 index 00000000..d386b0c1 --- /dev/null +++ b/sha3/sph_bmw.h @@ -0,0 +1,328 @@ +/* $Id: sph_bmw.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * BMW interface. BMW (aka "Blue Midnight Wish") is a family of + * functions which differ by their output size; this implementation + * defines BMW for output sizes 224, 256, 384 and 512 bits. 
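+ * + * All four output sizes share the same streaming interface; here is a + * minimal one-shot sketch (data and len stand for the caller's input + * buffer; the functions are the ones declared below): + * + *    sph_bmw256_context cc; + *    unsigned char out[32]; + * + *    sph_bmw256_init(&cc); + *    sph_bmw256(&cc, data, len);   (may be called repeatedly) + *    sph_bmw256_close(&cc, out);   (also reinitializes cc)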
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_bmw.h + * @author Thomas Pornin + */ + +#ifndef SPH_BMW_H__ +#define SPH_BMW_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for BMW-224. + */ +#define SPH_SIZE_bmw224 224 + +/** + * Output size (in bits) for BMW-256. + */ +#define SPH_SIZE_bmw256 256 + +#if SPH_64 + +/** + * Output size (in bits) for BMW-384. + */ +#define SPH_SIZE_bmw384 384 + +/** + * Output size (in bits) for BMW-512. + */ +#define SPH_SIZE_bmw512 512 + +#endif + +/** + * This structure is a context for BMW-224 and BMW-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 H[16]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_bmw_small_context; + +/** + * This structure is a context for BMW-224 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw224_context; + +/** + * This structure is a context for BMW-256 computations. It is + * identical to the common sph_bmw_small_context. + */ +typedef sph_bmw_small_context sph_bmw256_context; + +#if SPH_64 + +/** + * This structure is a context for BMW-384 and BMW-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a BMW computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running BMW + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u64 H[16]; + sph_u64 bit_count; +#endif +} sph_bmw_big_context; + +/** + * This structure is a context for BMW-384 computations.
It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw384_context; + +/** + * This structure is a context for BMW-512 computations. It is + * identical to the common sph_bmw_big_context. + */ +typedef sph_bmw_big_context sph_bmw512_context; + +#endif + +/** + * Initialize a BMW-224 context. This process performs no memory allocation. + * + * @param cc the BMW-224 context (pointer to a + * sph_bmw224_context) + */ +void sph_bmw224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw224(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-224 context + * @param dst the destination buffer + */ +void sph_bmw224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). For instance, with n == 2, the two extra bits are taken from + * bits 7 and 6 of ub, in that order. The context is automatically + * reinitialized. + * + * @param cc the BMW-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-256 context. This process performs no memory allocation. + * + * @param cc the BMW-256 context (pointer to a + * sph_bmw256_context) + */ +void sph_bmw256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw256(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-256 context + * @param dst the destination buffer + */ +void sph_bmw256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#if SPH_64 + +/** + * Initialize a BMW-384 context. This process performs no memory allocation.
+ * + * @param cc the BMW-384 context (pointer to a + * sph_bmw384_context) + */ +void sph_bmw384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw384(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-384 context + * @param dst the destination buffer + */ +void sph_bmw384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a BMW-512 context. This process performs no memory allocation. + * + * @param cc the BMW-512 context (pointer to a + * sph_bmw512_context) + */ +void sph_bmw512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the BMW-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_bmw512(void *cc, const void *data, size_t len); + +/** + * Terminate the current BMW-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the BMW-512 context + * @param dst the destination buffer + */ +void sph_bmw512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the BMW-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_bmw512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_cubehash.c b/sha3/sph_cubehash.c new file mode 100644 index 00000000..9322fe14 --- /dev/null +++ b/sha3/sph_cubehash.c @@ -0,0 +1,723 @@ +/* $Id: cubehash.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * CubeHash implementation.
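+ * + * These are the "standard parameters" (CubeHash16/32): each 32-byte + * input block is XORed into the 1024-bit state and followed by 16 + * rounds; finalization XORs 1 into the last state word and applies + * 160 further rounds (see cubehash_close() below).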
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_cubehash.h" +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_CUBEHASH +#define SPH_SMALL_FOOTPRINT_CUBEHASH 1 +#endif + +/* + * Some tests were conducted on an Intel Core2 Q6600 (32-bit and 64-bit + * mode), a PowerPC G3, and a MIPS-compatible CPU (Broadcom BCM3302). + * It appears that the optimal settings are: + * -- full unroll, no state copy on the "big" systems (x86, PowerPC) + * -- unroll to 4 or 8, state copy on the "small" system (MIPS) + */ + +#if SPH_SMALL_FOOTPRINT_CUBEHASH + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 4 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 1 +#endif + +#else + +#if !defined SPH_CUBEHASH_UNROLL +#define SPH_CUBEHASH_UNROLL 0 +#endif +#if !defined SPH_CUBEHASH_NOCOPY +#define SPH_CUBEHASH_NOCOPY 0 +#endif + +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +static const sph_u32 IV224[] = { + SPH_C32(0xB0FC8217), SPH_C32(0x1BEE1A90), SPH_C32(0x829E1A22), + SPH_C32(0x6362C342), SPH_C32(0x24D91C30), SPH_C32(0x03A7AA24), + SPH_C32(0xA63721C8), SPH_C32(0x85B0E2EF), SPH_C32(0xF35D13F3), + SPH_C32(0x41DA807D), SPH_C32(0x21A70CA6), SPH_C32(0x1F4E9774), + SPH_C32(0xB3E1C932), SPH_C32(0xEB0A79A8), SPH_C32(0xCDDAAA66), + SPH_C32(0xE2F6ECAA), SPH_C32(0x0A713362), SPH_C32(0xAA3080E0), + SPH_C32(0xD8F23A32), SPH_C32(0xCEF15E28), SPH_C32(0xDB086314), + SPH_C32(0x7F709DF7), SPH_C32(0xACD228A4), SPH_C32(0x704D6ECE), + SPH_C32(0xAA3EC95F), SPH_C32(0xE387C214), SPH_C32(0x3A6445FF), + SPH_C32(0x9CAB81C3), SPH_C32(0xC73D4B98), SPH_C32(0xD277AEBE), + SPH_C32(0xFD20151C), SPH_C32(0x00CB573E) +}; + +static const sph_u32 IV256[] = { + SPH_C32(0xEA2BD4B4), SPH_C32(0xCCD6F29F), SPH_C32(0x63117E71), + SPH_C32(0x35481EAE), SPH_C32(0x22512D5B), SPH_C32(0xE5D94E63), + SPH_C32(0x7E624131), SPH_C32(0xF4CC12BE), SPH_C32(0xC2D0B696), + SPH_C32(0x42AF2070), SPH_C32(0xD0720C35), SPH_C32(0x3361DA8C), + SPH_C32(0x28CCECA4), SPH_C32(0x8EF8AD83), SPH_C32(0x4680AC00), + SPH_C32(0x40E5FBAB), SPH_C32(0xD89041C3), SPH_C32(0x6107FBD5), + SPH_C32(0x6C859D41), SPH_C32(0xF0B26679), SPH_C32(0x09392549), + SPH_C32(0x5FA25603),
SPH_C32(0x65C892FD), SPH_C32(0x93CB6285), + SPH_C32(0x2AF2B5AE), SPH_C32(0x9E4B4E60), SPH_C32(0x774ABFDD), + SPH_C32(0x85254725), SPH_C32(0x15815AEB), SPH_C32(0x4AB6AAD6), + SPH_C32(0x9CDAF8AF), SPH_C32(0xD6032C0A) +}; + +static const sph_u32 IV384[] = { + SPH_C32(0xE623087E), SPH_C32(0x04C00C87), SPH_C32(0x5EF46453), + SPH_C32(0x69524B13), SPH_C32(0x1A05C7A9), SPH_C32(0x3528DF88), + SPH_C32(0x6BDD01B5), SPH_C32(0x5057B792), SPH_C32(0x6AA7A922), + SPH_C32(0x649C7EEE), SPH_C32(0xF426309F), SPH_C32(0xCB629052), + SPH_C32(0xFC8E20ED), SPH_C32(0xB3482BAB), SPH_C32(0xF89E5E7E), + SPH_C32(0xD83D4DE4), SPH_C32(0x44BFC10D), SPH_C32(0x5FC1E63D), + SPH_C32(0x2104E6CB), SPH_C32(0x17958F7F), SPH_C32(0xDBEAEF70), + SPH_C32(0xB4B97E1E), SPH_C32(0x32C195F6), SPH_C32(0x6184A8E4), + SPH_C32(0x796C2543), SPH_C32(0x23DE176D), SPH_C32(0xD33BBAEC), + SPH_C32(0x0C12E5D2), SPH_C32(0x4EB95A7B), SPH_C32(0x2D18BA01), + SPH_C32(0x04EE475F), SPH_C32(0x1FC5F22E) +}; + +static const sph_u32 IV512[] = { + SPH_C32(0x2AEA2A61), SPH_C32(0x50F494D4), SPH_C32(0x2D538B8B), + SPH_C32(0x4167D83E), SPH_C32(0x3FEE2313), SPH_C32(0xC701CF8C), + SPH_C32(0xCC39968E), SPH_C32(0x50AC5695), SPH_C32(0x4D42C787), + SPH_C32(0xA647A8B3), SPH_C32(0x97CF0BEF), SPH_C32(0x825B4537), + SPH_C32(0xEEF864D2), SPH_C32(0xF22090C4), SPH_C32(0xD0E5CD33), + SPH_C32(0xA23911AE), SPH_C32(0xFCD398D9), SPH_C32(0x148FE485), + SPH_C32(0x1B017BEF), SPH_C32(0xB6444532), SPH_C32(0x6A536159), + SPH_C32(0x2FF5781C), SPH_C32(0x91FA7934), SPH_C32(0x0DBADEA9), + SPH_C32(0xD65C8A2B), SPH_C32(0xA5A70E75), SPH_C32(0xB1C62456), + SPH_C32(0xBC796576), SPH_C32(0x1921C8F7), SPH_C32(0xE7989AF1), + SPH_C32(0x7795D246), SPH_C32(0xD43E3B44) +}; + +#define T32 SPH_T32 +#define ROTL32 SPH_ROTL32 + +#if SPH_CUBEHASH_NOCOPY + +#define DECL_STATE +#define READ_STATE(cc) +#define WRITE_STATE(cc) + +#define x0 ((sc)->state[ 0]) +#define x1 ((sc)->state[ 1]) +#define x2 ((sc)->state[ 2]) +#define x3 ((sc)->state[ 3]) +#define x4 ((sc)->state[ 4]) +#define x5 ((sc)->state[ 5]) +#define x6 ((sc)->state[ 6]) +#define x7 ((sc)->state[ 7]) +#define x8 ((sc)->state[ 8]) +#define x9 ((sc)->state[ 9]) +#define xa ((sc)->state[10]) +#define xb ((sc)->state[11]) +#define xc ((sc)->state[12]) +#define xd ((sc)->state[13]) +#define xe ((sc)->state[14]) +#define xf ((sc)->state[15]) +#define xg ((sc)->state[16]) +#define xh ((sc)->state[17]) +#define xi ((sc)->state[18]) +#define xj ((sc)->state[19]) +#define xk ((sc)->state[20]) +#define xl ((sc)->state[21]) +#define xm ((sc)->state[22]) +#define xn ((sc)->state[23]) +#define xo ((sc)->state[24]) +#define xp ((sc)->state[25]) +#define xq ((sc)->state[26]) +#define xr ((sc)->state[27]) +#define xs ((sc)->state[28]) +#define xt ((sc)->state[29]) +#define xu ((sc)->state[30]) +#define xv ((sc)->state[31]) + +#else + +#define DECL_STATE \ + sph_u32 x0, x1, x2, x3, x4, x5, x6, x7; \ + sph_u32 x8, x9, xa, xb, xc, xd, xe, xf; \ + sph_u32 xg, xh, xi, xj, xk, xl, xm, xn; \ + sph_u32 xo, xp, xq, xr, xs, xt, xu, xv; + +#define READ_STATE(cc) do { \ + x0 = (cc)->state[ 0]; \ + x1 = (cc)->state[ 1]; \ + x2 = (cc)->state[ 2]; \ + x3 = (cc)->state[ 3]; \ + x4 = (cc)->state[ 4]; \ + x5 = (cc)->state[ 5]; \ + x6 = (cc)->state[ 6]; \ + x7 = (cc)->state[ 7]; \ + x8 = (cc)->state[ 8]; \ + x9 = (cc)->state[ 9]; \ + xa = (cc)->state[10]; \ + xb = (cc)->state[11]; \ + xc = (cc)->state[12]; \ + xd = (cc)->state[13]; \ + xe = (cc)->state[14]; \ + xf = (cc)->state[15]; \ + xg = (cc)->state[16]; \ + xh = (cc)->state[17]; \ + xi = (cc)->state[18]; \ + xj = (cc)->state[19]; \ 
+ xk = (cc)->state[20]; \ + xl = (cc)->state[21]; \ + xm = (cc)->state[22]; \ + xn = (cc)->state[23]; \ + xo = (cc)->state[24]; \ + xp = (cc)->state[25]; \ + xq = (cc)->state[26]; \ + xr = (cc)->state[27]; \ + xs = (cc)->state[28]; \ + xt = (cc)->state[29]; \ + xu = (cc)->state[30]; \ + xv = (cc)->state[31]; \ + } while (0) + +#define WRITE_STATE(cc) do { \ + (cc)->state[ 0] = x0; \ + (cc)->state[ 1] = x1; \ + (cc)->state[ 2] = x2; \ + (cc)->state[ 3] = x3; \ + (cc)->state[ 4] = x4; \ + (cc)->state[ 5] = x5; \ + (cc)->state[ 6] = x6; \ + (cc)->state[ 7] = x7; \ + (cc)->state[ 8] = x8; \ + (cc)->state[ 9] = x9; \ + (cc)->state[10] = xa; \ + (cc)->state[11] = xb; \ + (cc)->state[12] = xc; \ + (cc)->state[13] = xd; \ + (cc)->state[14] = xe; \ + (cc)->state[15] = xf; \ + (cc)->state[16] = xg; \ + (cc)->state[17] = xh; \ + (cc)->state[18] = xi; \ + (cc)->state[19] = xj; \ + (cc)->state[20] = xk; \ + (cc)->state[21] = xl; \ + (cc)->state[22] = xm; \ + (cc)->state[23] = xn; \ + (cc)->state[24] = xo; \ + (cc)->state[25] = xp; \ + (cc)->state[26] = xq; \ + (cc)->state[27] = xr; \ + (cc)->state[28] = xs; \ + (cc)->state[29] = xt; \ + (cc)->state[30] = xu; \ + (cc)->state[31] = xv; \ + } while (0) + +#endif + +#define INPUT_BLOCK do { \ + x0 ^= sph_dec32le_aligned(buf + 0); \ + x1 ^= sph_dec32le_aligned(buf + 4); \ + x2 ^= sph_dec32le_aligned(buf + 8); \ + x3 ^= sph_dec32le_aligned(buf + 12); \ + x4 ^= sph_dec32le_aligned(buf + 16); \ + x5 ^= sph_dec32le_aligned(buf + 20); \ + x6 ^= sph_dec32le_aligned(buf + 24); \ + x7 ^= sph_dec32le_aligned(buf + 28); \ + } while (0) + +#define ROUND_EVEN do { \ + xg = T32(x0 + xg); \ + x0 = ROTL32(x0, 7); \ + xh = T32(x1 + xh); \ + x1 = ROTL32(x1, 7); \ + xi = T32(x2 + xi); \ + x2 = ROTL32(x2, 7); \ + xj = T32(x3 + xj); \ + x3 = ROTL32(x3, 7); \ + xk = T32(x4 + xk); \ + x4 = ROTL32(x4, 7); \ + xl = T32(x5 + xl); \ + x5 = ROTL32(x5, 7); \ + xm = T32(x6 + xm); \ + x6 = ROTL32(x6, 7); \ + xn = T32(x7 + xn); \ + x7 = ROTL32(x7, 7); \ + xo = T32(x8 + xo); \ + x8 = ROTL32(x8, 7); \ + xp = T32(x9 + xp); \ + x9 = ROTL32(x9, 7); \ + xq = T32(xa + xq); \ + xa = ROTL32(xa, 7); \ + xr = T32(xb + xr); \ + xb = ROTL32(xb, 7); \ + xs = T32(xc + xs); \ + xc = ROTL32(xc, 7); \ + xt = T32(xd + xt); \ + xd = ROTL32(xd, 7); \ + xu = T32(xe + xu); \ + xe = ROTL32(xe, 7); \ + xv = T32(xf + xv); \ + xf = ROTL32(xf, 7); \ + x8 ^= xg; \ + x9 ^= xh; \ + xa ^= xi; \ + xb ^= xj; \ + xc ^= xk; \ + xd ^= xl; \ + xe ^= xm; \ + xf ^= xn; \ + x0 ^= xo; \ + x1 ^= xp; \ + x2 ^= xq; \ + x3 ^= xr; \ + x4 ^= xs; \ + x5 ^= xt; \ + x6 ^= xu; \ + x7 ^= xv; \ + xi = T32(x8 + xi); \ + x8 = ROTL32(x8, 11); \ + xj = T32(x9 + xj); \ + x9 = ROTL32(x9, 11); \ + xg = T32(xa + xg); \ + xa = ROTL32(xa, 11); \ + xh = T32(xb + xh); \ + xb = ROTL32(xb, 11); \ + xm = T32(xc + xm); \ + xc = ROTL32(xc, 11); \ + xn = T32(xd + xn); \ + xd = ROTL32(xd, 11); \ + xk = T32(xe + xk); \ + xe = ROTL32(xe, 11); \ + xl = T32(xf + xl); \ + xf = ROTL32(xf, 11); \ + xq = T32(x0 + xq); \ + x0 = ROTL32(x0, 11); \ + xr = T32(x1 + xr); \ + x1 = ROTL32(x1, 11); \ + xo = T32(x2 + xo); \ + x2 = ROTL32(x2, 11); \ + xp = T32(x3 + xp); \ + x3 = ROTL32(x3, 11); \ + xu = T32(x4 + xu); \ + x4 = ROTL32(x4, 11); \ + xv = T32(x5 + xv); \ + x5 = ROTL32(x5, 11); \ + xs = T32(x6 + xs); \ + x6 = ROTL32(x6, 11); \ + xt = T32(x7 + xt); \ + x7 = ROTL32(x7, 11); \ + xc ^= xi; \ + xd ^= xj; \ + xe ^= xg; \ + xf ^= xh; \ + x8 ^= xm; \ + x9 ^= xn; \ + xa ^= xk; \ + xb ^= xl; \ + x4 ^= xq; \ + x5 ^= xr; \ + x6 ^= xo; \ + x7 ^= xp; \ + x0 ^= xu; \ + x1 ^= xv; \ 
+ x2 ^= xs; \ + x3 ^= xt; \ + } while (0) + +#define ROUND_ODD do { \ + xj = T32(xc + xj); \ + xc = ROTL32(xc, 7); \ + xi = T32(xd + xi); \ + xd = ROTL32(xd, 7); \ + xh = T32(xe + xh); \ + xe = ROTL32(xe, 7); \ + xg = T32(xf + xg); \ + xf = ROTL32(xf, 7); \ + xn = T32(x8 + xn); \ + x8 = ROTL32(x8, 7); \ + xm = T32(x9 + xm); \ + x9 = ROTL32(x9, 7); \ + xl = T32(xa + xl); \ + xa = ROTL32(xa, 7); \ + xk = T32(xb + xk); \ + xb = ROTL32(xb, 7); \ + xr = T32(x4 + xr); \ + x4 = ROTL32(x4, 7); \ + xq = T32(x5 + xq); \ + x5 = ROTL32(x5, 7); \ + xp = T32(x6 + xp); \ + x6 = ROTL32(x6, 7); \ + xo = T32(x7 + xo); \ + x7 = ROTL32(x7, 7); \ + xv = T32(x0 + xv); \ + x0 = ROTL32(x0, 7); \ + xu = T32(x1 + xu); \ + x1 = ROTL32(x1, 7); \ + xt = T32(x2 + xt); \ + x2 = ROTL32(x2, 7); \ + xs = T32(x3 + xs); \ + x3 = ROTL32(x3, 7); \ + x4 ^= xj; \ + x5 ^= xi; \ + x6 ^= xh; \ + x7 ^= xg; \ + x0 ^= xn; \ + x1 ^= xm; \ + x2 ^= xl; \ + x3 ^= xk; \ + xc ^= xr; \ + xd ^= xq; \ + xe ^= xp; \ + xf ^= xo; \ + x8 ^= xv; \ + x9 ^= xu; \ + xa ^= xt; \ + xb ^= xs; \ + xh = T32(x4 + xh); \ + x4 = ROTL32(x4, 11); \ + xg = T32(x5 + xg); \ + x5 = ROTL32(x5, 11); \ + xj = T32(x6 + xj); \ + x6 = ROTL32(x6, 11); \ + xi = T32(x7 + xi); \ + x7 = ROTL32(x7, 11); \ + xl = T32(x0 + xl); \ + x0 = ROTL32(x0, 11); \ + xk = T32(x1 + xk); \ + x1 = ROTL32(x1, 11); \ + xn = T32(x2 + xn); \ + x2 = ROTL32(x2, 11); \ + xm = T32(x3 + xm); \ + x3 = ROTL32(x3, 11); \ + xp = T32(xc + xp); \ + xc = ROTL32(xc, 11); \ + xo = T32(xd + xo); \ + xd = ROTL32(xd, 11); \ + xr = T32(xe + xr); \ + xe = ROTL32(xe, 11); \ + xq = T32(xf + xq); \ + xf = ROTL32(xf, 11); \ + xt = T32(x8 + xt); \ + x8 = ROTL32(x8, 11); \ + xs = T32(x9 + xs); \ + x9 = ROTL32(x9, 11); \ + xv = T32(xa + xv); \ + xa = ROTL32(xa, 11); \ + xu = T32(xb + xu); \ + xb = ROTL32(xb, 11); \ + x0 ^= xh; \ + x1 ^= xg; \ + x2 ^= xj; \ + x3 ^= xi; \ + x4 ^= xl; \ + x5 ^= xk; \ + x6 ^= xn; \ + x7 ^= xm; \ + x8 ^= xp; \ + x9 ^= xo; \ + xa ^= xr; \ + xb ^= xq; \ + xc ^= xt; \ + xd ^= xs; \ + xe ^= xv; \ + xf ^= xu; \ + } while (0) + +/* + * There is no need to unroll all 16 rounds. The word-swapping permutation + * is an involution, so we need to unroll an even number of rounds. On + * "big" systems, unrolling 4 rounds yields about 97% of the speed + * achieved with full unrolling; and it keeps the code more compact + * for small architectures. 
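+ * + * The unrolling factor and the state-copy behaviour can be forced at + * build time by predefining SPH_CUBEHASH_UNROLL and + * SPH_CUBEHASH_NOCOPY, e.g. with a (hypothetical) compiler invocation + * such as: + * + *    cc -DSPH_CUBEHASH_UNROLL=4 -DSPH_CUBEHASH_NOCOPY=1 -c sph_cubehash.c + * + * Any SPH_CUBEHASH_UNROLL value other than 2, 4 or 8 selects the fully + * unrolled code below.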
+ */ + +#if SPH_CUBEHASH_UNROLL == 2 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 8; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 4 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 4; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#elif SPH_CUBEHASH_UNROLL == 8 + +#define SIXTEEN_ROUNDS do { \ + int j; \ + for (j = 0; j < 2; j ++) { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } \ + } while (0) + +#else + +#define SIXTEEN_ROUNDS do { \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + ROUND_EVEN; \ + ROUND_ODD; \ + } while (0) + +#endif + +static void +cubehash_init(sph_cubehash_context *sc, const sph_u32 *iv) +{ + memcpy(sc->state, iv, sizeof sc->state); + sc->ptr = 0; +} + +static void +cubehash_core(sph_cubehash_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BLOCK; + SIXTEEN_ROUNDS; + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +cubehash_close(sph_cubehash_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE(sc); + INPUT_BLOCK; + for (i = 0; i < 11; i ++) { + SIXTEEN_ROUNDS; + if (i == 0) + xv ^= SPH_C32(1); + } + WRITE_STATE(sc); + out = dst; + for (z = 0; z < out_size_w32; z ++) + sph_enc32le(out + (z << 2), sc->state[z]); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_init(void *cc) +{ + cubehash_init(cc, IV224); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_close(void *cc, void *dst) +{ + sph_cubehash224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 7); + sph_cubehash224_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_init(void *cc) +{ + cubehash_init(cc, IV256); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_close(void *cc, void *dst) +{ + sph_cubehash256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 8); + sph_cubehash256_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_init(void *cc) +{ + cubehash_init(cc, IV384); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384(void *cc, const void 
*data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_close(void *cc, void *dst) +{ + sph_cubehash384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 12); + sph_cubehash384_init(cc); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_init(void *cc) +{ + cubehash_init(cc, IV512); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512(void *cc, const void *data, size_t len) +{ + cubehash_core(cc, data, len); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_close(void *cc, void *dst) +{ + sph_cubehash512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_cubehash.h */ +void +sph_cubehash512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + cubehash_close(cc, ub, n, dst, 16); + sph_cubehash512_init(cc); +} +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_cubehash.h b/sha3/sph_cubehash.h new file mode 100644 index 00000000..487a1946 --- /dev/null +++ b/sha3/sph_cubehash.h @@ -0,0 +1,292 @@ +/* $Id: sph_cubehash.h 180 2010-05-08 02:29:25Z tp $ */ +/** + * CubeHash interface. CubeHash is a family of functions which differ by + * their output size; this implementation defines CubeHash for output + * sizes 224, 256, 384 and 512 bits, with the "standard parameters" + * (CubeHash16/32 with the CubeHash specification notations). + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_cubehash.h + * @author Thomas Pornin + */ + +#ifndef SPH_CUBEHASH_H__ +#define SPH_CUBEHASH_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for CubeHash-224. + */ +#define SPH_SIZE_cubehash224 224 + +/** + * Output size (in bits) for CubeHash-256. + */ +#define SPH_SIZE_cubehash256 256 + +/** + * Output size (in bits) for CubeHash-384. + */ +#define SPH_SIZE_cubehash384 384 + +/** + * Output size (in bits) for CubeHash-512. + */ +#define SPH_SIZE_cubehash512 512 + +/** + * This structure is a context for CubeHash computations: it contains the + * intermediate values and some data from the last entered block.
Once + * a CubeHash computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running CubeHash computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 state[32]; +#endif +} sph_cubehash_context; + +/** + * Type for a CubeHash-224 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash224_context; + +/** + * Type for a CubeHash-256 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash256_context; + +/** + * Type for a CubeHash-384 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash384_context; + +/** + * Type for a CubeHash-512 context (identical to the common context). + */ +typedef sph_cubehash_context sph_cubehash512_context; + +/** + * Initialize a CubeHash-224 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-224 context (pointer to a + * sph_cubehash224_context) + */ +void sph_cubehash224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash224(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-224 context + * @param dst the destination buffer + */ +void sph_cubehash224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-256 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-256 context (pointer to a + * sph_cubehash256_context) + */ +void sph_cubehash256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash256(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized.
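+ * Because of that reinitialization, several messages can be hashed + * back to back without further init calls; a sketch, where msg1, msg2, + * the lengths and the output buffers are the caller's: + * + *    sph_cubehash256_init(&cc); + *    sph_cubehash256(&cc, msg1, len1); + *    sph_cubehash256_close(&cc, out1); + *    sph_cubehash256(&cc, msg2, len2);   (no init needed here) + *    sph_cubehash256_close(&cc, out2);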
+ * + * @param cc the CubeHash-256 context + * @param dst the destination buffer + */ +void sph_cubehash256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-384 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-384 context (pointer to a + * sph_cubehash384_context) + */ +void sph_cubehash384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash384(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-384 context + * @param dst the destination buffer + */ +void sph_cubehash384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a CubeHash-512 context. This process performs no memory + * allocation. + * + * @param cc the CubeHash-512 context (pointer to a + * sph_cubehash512_context) + */ +void sph_cubehash512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the CubeHash-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_cubehash512(void *cc, const void *data, size_t len); + +/** + * Terminate the current CubeHash-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the CubeHash-512 context + * @param dst the destination buffer + */ +void sph_cubehash512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes).
If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the CubeHash-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_cubehash512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_echo.c b/sha3/sph_echo.c new file mode 100644 index 00000000..667e3f35 --- /dev/null +++ b/sha3/sph_echo.c @@ -0,0 +1,1031 @@ +/* $Id: echo.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * ECHO implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_echo.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_ECHO +#define SPH_SMALL_FOOTPRINT_ECHO 1 +#endif + +/* + * Some measures tend to show that the 64-bit implementation offers + * better performance only on true 64-bit architectures, those which + * have actual 64-bit registers. + */ +#if !defined SPH_ECHO_64 && SPH_64_TRUE +#define SPH_ECHO_64 1 +#endif + +/* + * We can use a 64-bit implementation only if a 64-bit type is available.
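+ * SPH_ECHO_64 itself may also be predefined (e.g. -DSPH_ECHO_64=0) to + * force the 32-bit code path even on a 64-bit build; the guard above + * only installs a default.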
+ */ +#if !SPH_64 +#undef SPH_ECHO_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define T32 SPH_T32 +#define C32 SPH_C32 +#if SPH_64 +#define C64 SPH_C64 +#endif + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +#if SPH_ECHO_64 + +#define DECL_STATE_SMALL \ + sph_u64 W[16][2]; + +#define DECL_STATE_BIG \ + sph_u64 W[16][2]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 8 * sizeof(sph_u64)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vb, 16 * sizeof(sph_u64)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec64le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec64le_aligned( \ + sc->buf + 16 * u + 8); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u64 W[16][2], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u64 Wl = W[n][0]; + sph_u64 Wh = W[n][1]; + sph_u32 X0 = (sph_u32)Wl; + sph_u32 X1 = (sph_u32)(Wl >> 32); + sph_u32 X2 = (sph_u32)Wh; + sph_u32 X3 = (sph_u32)(Wh >> 32); + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); + W[n][0] = (sph_u64)X0 | ((sph_u64)X1 << 32); + W[n][1] = (sph_u64)X2 | ((sph_u64)X3 << 32); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 X0 = (sph_u32)(X[0]); \ + sph_u32 X1 = (sph_u32)(X[0] >> 32); \ + sph_u32 X2 = (sph_u32)(X[1]); \ + sph_u32 X3 = (sph_u32)(X[1] >> 32); \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X0, X1, X2, X3, K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X0, X1, X2, X3); \ + X[0] = (sph_u64)X0 | ((sph_u64)X1 << 32); \ + X[1] = (sph_u64)X2 | ((sph_u64)X3 << 32); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u64 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + 
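+/* + * The 16 state words form four rows, {0, 4, 8, 12}, {1, 5, 9, 13}, + * {2, 6, 10, 14} and {3, 7, 11, 15}, each word being a 128-bit value + * kept here as two 64-bit halves. BIG_SHIFT_ROWS rotates row r + * leftwards by r positions (row 0 stays in place), mirroring the AES + * ShiftRows step at the 128-bit word level. + */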
+#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u64 W[16][2], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 2; n ++) { + sph_u64 a = W[ia][n]; + sph_u64 b = W[ib][n]; + sph_u64 c = W[ic][n]; + sph_u64 d = W[id][n]; + sph_u64 ab = a ^ b; + sph_u64 bc = b ^ c; + sph_u64 cd = c ^ d; + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u64 a = W[ia][n]; \ + sph_u64 b = W[ib][n]; \ + sph_u64 c = W[ic][n]; \ + sph_u64 d = W[id][n]; \ + sph_u64 ab = a ^ b; \ + sph_u64 bc = b ^ c; \ + sph_u64 cd = c ^ d; \ + sph_u64 abx = ((ab & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((ab & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 bcx = ((bc & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((bc & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + sph_u64 cdx = ((cd & C64(0x8080808080808080)) >> 7) * 27U \ + ^ ((cd & C64(0x7F7F7F7F7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 8; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 64) \ + ^ sph_dec64le_aligned(sc->buf + (u * 8) + 128) \ + ^ WW[u] ^ WW[u + 8] \ + ^ WW[u + 16] ^ WW[u + 24]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u64 *VV = &sc->u.Vb[0][0]; \ + sph_u64 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec64le_aligned(sc->buf + (u * 8)) \ + ^ WW[u] ^ WW[u + 16]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 W[16][4]; + +#define DECL_STATE_BIG \ + sph_u32 W[16][4]; + +#define INPUT_BLOCK_SMALL(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 16 * sizeof(sph_u32)); \ + for (u = 0; u < 12; u ++) { \ + W[u + 4][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 4][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 4][2] = 
sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 4][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#define INPUT_BLOCK_BIG(sc) do { \ + unsigned u; \ + memcpy(W, sc->u.Vs, 32 * sizeof(sph_u32)); \ + for (u = 0; u < 8; u ++) { \ + W[u + 8][0] = sph_dec32le_aligned( \ + sc->buf + 16 * u); \ + W[u + 8][1] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 4); \ + W[u + 8][2] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 8); \ + W[u + 8][3] = sph_dec32le_aligned( \ + sc->buf + 16 * u + 12); \ + } \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +aes_2rounds_all(sph_u32 W[16][4], + sph_u32 *pK0, sph_u32 *pK1, sph_u32 *pK2, sph_u32 *pK3) +{ + int n; + sph_u32 K0 = *pK0; + sph_u32 K1 = *pK1; + sph_u32 K2 = *pK2; + sph_u32 K3 = *pK3; + + for (n = 0; n < 16; n ++) { + sph_u32 *X = W[n]; + sph_u32 Y0, Y1, Y2, Y3; + AES_ROUND_LE(X[0], X[1], X[2], X[3], + K0, K1, K2, K3, Y0, Y1, Y2, Y3); + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); + if ((K0 = T32(K0 + 1)) == 0) { + if ((K1 = T32(K1 + 1)) == 0) + if ((K2 = T32(K2 + 1)) == 0) + K3 = T32(K3 + 1); + } + } + *pK0 = K0; + *pK1 = K1; + *pK2 = K2; + *pK3 = K3; +} + +#define BIG_SUB_WORDS do { \ + aes_2rounds_all(W, &K0, &K1, &K2, &K3); \ + } while (0) + +#else + +#define AES_2ROUNDS(X) do { \ + sph_u32 Y0, Y1, Y2, Y3; \ + AES_ROUND_LE(X[0], X[1], X[2], X[3], \ + K0, K1, K2, K3, Y0, Y1, Y2, Y3); \ + AES_ROUND_NOKEY_LE(Y0, Y1, Y2, Y3, X[0], X[1], X[2], X[3]); \ + if ((K0 = T32(K0 + 1)) == 0) { \ + if ((K1 = T32(K1 + 1)) == 0) \ + if ((K2 = T32(K2 + 1)) == 0) \ + K3 = T32(K3 + 1); \ + } \ + } while (0) + +#define BIG_SUB_WORDS do { \ + AES_2ROUNDS(W[ 0]); \ + AES_2ROUNDS(W[ 1]); \ + AES_2ROUNDS(W[ 2]); \ + AES_2ROUNDS(W[ 3]); \ + AES_2ROUNDS(W[ 4]); \ + AES_2ROUNDS(W[ 5]); \ + AES_2ROUNDS(W[ 6]); \ + AES_2ROUNDS(W[ 7]); \ + AES_2ROUNDS(W[ 8]); \ + AES_2ROUNDS(W[ 9]); \ + AES_2ROUNDS(W[10]); \ + AES_2ROUNDS(W[11]); \ + AES_2ROUNDS(W[12]); \ + AES_2ROUNDS(W[13]); \ + AES_2ROUNDS(W[14]); \ + AES_2ROUNDS(W[15]); \ + } while (0) + +#endif + +#define SHIFT_ROW1(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[b][0]; \ + W[b][0] = W[c][0]; \ + W[c][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[b][1]; \ + W[b][1] = W[c][1]; \ + W[c][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[b][2]; \ + W[b][2] = W[c][2]; \ + W[c][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[b][3]; \ + W[b][3] = W[c][3]; \ + W[c][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW2(a, b, c, d) do { \ + sph_u32 tmp; \ + tmp = W[a][0]; \ + W[a][0] = W[c][0]; \ + W[c][0] = tmp; \ + tmp = W[b][0]; \ + W[b][0] = W[d][0]; \ + W[d][0] = tmp; \ + tmp = W[a][1]; \ + W[a][1] = W[c][1]; \ + W[c][1] = tmp; \ + tmp = W[b][1]; \ + W[b][1] = W[d][1]; \ + W[d][1] = tmp; \ + tmp = W[a][2]; \ + W[a][2] = W[c][2]; \ + W[c][2] = tmp; \ + tmp = W[b][2]; \ + W[b][2] = W[d][2]; \ + W[d][2] = tmp; \ + tmp = W[a][3]; \ + W[a][3] = W[c][3]; \ + W[c][3] = tmp; \ + tmp = W[b][3]; \ + W[b][3] = W[d][3]; \ + W[d][3] = tmp; \ + } while (0) + +#define SHIFT_ROW3(a, b, c, d) SHIFT_ROW1(d, c, b, a) + +#define BIG_SHIFT_ROWS do { \ + SHIFT_ROW1(1, 5, 9, 13); \ + SHIFT_ROW2(2, 6, 10, 14); \ + SHIFT_ROW3(3, 7, 11, 15); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_ECHO + +static void +mix_column(sph_u32 W[16][4], int ia, int ib, int ic, int id) +{ + int n; + + for (n = 0; n < 4; n ++) { + sph_u32 a = W[ia][n]; + sph_u32 b = W[ib][n]; + sph_u32 c = W[ic][n]; + sph_u32 d = 
W[id][n]; + sph_u32 ab = a ^ b; + sph_u32 bc = b ^ c; + sph_u32 cd = c ^ d; + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U + ^ ((ab & C32(0x7F7F7F7F)) << 1); + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U + ^ ((bc & C32(0x7F7F7F7F)) << 1); + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U + ^ ((cd & C32(0x7F7F7F7F)) << 1); + W[ia][n] = abx ^ bc ^ d; + W[ib][n] = bcx ^ a ^ cd; + W[ic][n] = cdx ^ ab ^ d; + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; + } +} + +#define MIX_COLUMN(a, b, c, d) mix_column(W, a, b, c, d) + +#else + +#define MIX_COLUMN1(ia, ib, ic, id, n) do { \ + sph_u32 a = W[ia][n]; \ + sph_u32 b = W[ib][n]; \ + sph_u32 c = W[ic][n]; \ + sph_u32 d = W[id][n]; \ + sph_u32 ab = a ^ b; \ + sph_u32 bc = b ^ c; \ + sph_u32 cd = c ^ d; \ + sph_u32 abx = ((ab & C32(0x80808080)) >> 7) * 27U \ + ^ ((ab & C32(0x7F7F7F7F)) << 1); \ + sph_u32 bcx = ((bc & C32(0x80808080)) >> 7) * 27U \ + ^ ((bc & C32(0x7F7F7F7F)) << 1); \ + sph_u32 cdx = ((cd & C32(0x80808080)) >> 7) * 27U \ + ^ ((cd & C32(0x7F7F7F7F)) << 1); \ + W[ia][n] = abx ^ bc ^ d; \ + W[ib][n] = bcx ^ a ^ cd; \ + W[ic][n] = cdx ^ ab ^ d; \ + W[id][n] = abx ^ bcx ^ cdx ^ ab ^ c; \ + } while (0) + +#define MIX_COLUMN(a, b, c, d) do { \ + MIX_COLUMN1(a, b, c, d, 0); \ + MIX_COLUMN1(a, b, c, d, 1); \ + MIX_COLUMN1(a, b, c, d, 2); \ + MIX_COLUMN1(a, b, c, d, 3); \ + } while (0) + +#endif + +#define BIG_MIX_COLUMNS do { \ + MIX_COLUMN(0, 1, 2, 3); \ + MIX_COLUMN(4, 5, 6, 7); \ + MIX_COLUMN(8, 9, 10, 11); \ + MIX_COLUMN(12, 13, 14, 15); \ + } while (0) + +#define BIG_ROUND do { \ + BIG_SUB_WORDS; \ + BIG_SHIFT_ROWS; \ + BIG_MIX_COLUMNS; \ + } while (0) + +#define FINAL_SMALL do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 16; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 64) \ + ^ sph_dec32le_aligned(sc->buf + (u * 4) + 128) \ + ^ WW[u] ^ WW[u + 16] \ + ^ WW[u + 32] ^ WW[u + 48]; \ + } \ + } while (0) + +#define FINAL_BIG do { \ + unsigned u; \ + sph_u32 *VV = &sc->u.Vs[0][0]; \ + sph_u32 *WW = &W[0][0]; \ + for (u = 0; u < 32; u ++) { \ + VV[u] ^= sph_dec32le_aligned(sc->buf + (u * 4)) \ + ^ WW[u] ^ WW[u + 32]; \ + } \ + } while (0) + +#define COMPRESS_SMALL(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_SMALL(sc); \ + for (u = 0; u < 8; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_SMALL; \ + } while (0) + +#define COMPRESS_BIG(sc) do { \ + sph_u32 K0 = sc->C0; \ + sph_u32 K1 = sc->C1; \ + sph_u32 K2 = sc->C2; \ + sph_u32 K3 = sc->C3; \ + unsigned u; \ + INPUT_BLOCK_BIG(sc); \ + for (u = 0; u < 10; u ++) { \ + BIG_ROUND; \ + } \ + FINAL_BIG; \ + } while (0) + +#endif + +#define INCR_COUNTER(sc, val) do { \ + sc->C0 = T32(sc->C0 + (sph_u32)(val)); \ + if (sc->C0 < (sph_u32)(val)) { \ + if ((sc->C1 = T32(sc->C1 + 1)) == 0) \ + if ((sc->C2 = T32(sc->C2 + 1)) == 0) \ + sc->C3 = T32(sc->C3 + 1); \ + } \ + } while (0) + +static void +echo_small_init(sph_echo_small_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + 
sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_big_init(sph_echo_big_context *sc, unsigned out_len) +{ +#if SPH_ECHO_64 + sc->u.Vb[0][0] = (sph_u64)out_len; + sc->u.Vb[0][1] = 0; + sc->u.Vb[1][0] = (sph_u64)out_len; + sc->u.Vb[1][1] = 0; + sc->u.Vb[2][0] = (sph_u64)out_len; + sc->u.Vb[2][1] = 0; + sc->u.Vb[3][0] = (sph_u64)out_len; + sc->u.Vb[3][1] = 0; + sc->u.Vb[4][0] = (sph_u64)out_len; + sc->u.Vb[4][1] = 0; + sc->u.Vb[5][0] = (sph_u64)out_len; + sc->u.Vb[5][1] = 0; + sc->u.Vb[6][0] = (sph_u64)out_len; + sc->u.Vb[6][1] = 0; + sc->u.Vb[7][0] = (sph_u64)out_len; + sc->u.Vb[7][1] = 0; +#else + sc->u.Vs[0][0] = (sph_u32)out_len; + sc->u.Vs[0][1] = sc->u.Vs[0][2] = sc->u.Vs[0][3] = 0; + sc->u.Vs[1][0] = (sph_u32)out_len; + sc->u.Vs[1][1] = sc->u.Vs[1][2] = sc->u.Vs[1][3] = 0; + sc->u.Vs[2][0] = (sph_u32)out_len; + sc->u.Vs[2][1] = sc->u.Vs[2][2] = sc->u.Vs[2][3] = 0; + sc->u.Vs[3][0] = (sph_u32)out_len; + sc->u.Vs[3][1] = sc->u.Vs[3][2] = sc->u.Vs[3][3] = 0; + sc->u.Vs[4][0] = (sph_u32)out_len; + sc->u.Vs[4][1] = sc->u.Vs[4][2] = sc->u.Vs[4][3] = 0; + sc->u.Vs[5][0] = (sph_u32)out_len; + sc->u.Vs[5][1] = sc->u.Vs[5][2] = sc->u.Vs[5][3] = 0; + sc->u.Vs[6][0] = (sph_u32)out_len; + sc->u.Vs[6][1] = sc->u.Vs[6][2] = sc->u.Vs[6][3] = 0; + sc->u.Vs[7][0] = (sph_u32)out_len; + sc->u.Vs[7][1] = sc->u.Vs[7][2] = sc->u.Vs[7][3] = 0; +#endif + sc->ptr = 0; + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; +} + +static void +echo_small_compress(sph_echo_small_context *sc) +{ + DECL_STATE_SMALL + + COMPRESS_SMALL(sc); +} + +static void +echo_big_compress(sph_echo_big_context *sc) +{ + DECL_STATE_BIG + + COMPRESS_BIG(sc); +} + +static void +echo_small_core(sph_echo_small_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1536); + echo_small_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_big_core(sph_echo_big_context *sc, + const unsigned char *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INCR_COUNTER(sc, 1024); + echo_big_compress(sc); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +echo_small_close(sph_echo_small_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[32]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + 
sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_small_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_small_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_small_init(sc, out_size_w32 << 5); +} + +static void +echo_big_close(sph_echo_big_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf; + size_t ptr; + unsigned z; + unsigned elen; + union { + unsigned char tmp[64]; + sph_u32 dummy; +#if SPH_ECHO_64 + sph_u64 dummy2; +#endif + } u; +#if SPH_ECHO_64 + sph_u64 *VV; +#else + sph_u32 *VV; +#endif + unsigned k; + + buf = sc->buf; + ptr = sc->ptr; + elen = ((unsigned)ptr << 3) + n; + INCR_COUNTER(sc, elen); + sph_enc32le_aligned(u.tmp, sc->C0); + sph_enc32le_aligned(u.tmp + 4, sc->C1); + sph_enc32le_aligned(u.tmp + 8, sc->C2); + sph_enc32le_aligned(u.tmp + 12, sc->C3); + /* + * If elen is zero, then this block actually contains no message + * bit, only the first padding bit. + */ + if (elen == 0) { + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + } + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + if (ptr > ((sizeof sc->buf) - 18)) { + echo_big_compress(sc); + sc->C0 = sc->C1 = sc->C2 = sc->C3 = 0; + memset(buf, 0, sizeof sc->buf); + } + sph_enc16le(buf + (sizeof sc->buf) - 18, out_size_w32 << 5); + memcpy(buf + (sizeof sc->buf) - 16, u.tmp, 16); + echo_big_compress(sc); +#if SPH_ECHO_64 + for (VV = &sc->u.Vb[0][0], k = 0; k < ((out_size_w32 + 1) >> 1); k ++) + sph_enc64le_aligned(u.tmp + (k << 3), VV[k]); +#else + for (VV = &sc->u.Vs[0][0], k = 0; k < out_size_w32; k ++) + sph_enc32le_aligned(u.tmp + (k << 2), VV[k]); +#endif + memcpy(dst, u.tmp, out_size_w32 << 2); + echo_big_init(sc, out_size_w32 << 5); +} + +/* see sph_echo.h */ +void +sph_echo224_init(void *cc) +{ + echo_small_init(cc, 224); +} + +/* see sph_echo.h */ +void +sph_echo224(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo224_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 7); +} + +/* see sph_echo.h */ +void +sph_echo256_init(void *cc) +{ + echo_small_init(cc, 256); +} + +/* see sph_echo.h */ +void +sph_echo256(void *cc, const void *data, size_t len) +{ + echo_small_core(cc, data, len); +} + +/* see sph_echo.h */ +void +sph_echo256_close(void *cc, void *dst) +{ + echo_small_close(cc, 0, 0, dst, 8); +} + +/* see sph_echo.h */ +void +sph_echo256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + echo_small_close(cc, ub, n, dst, 8); +} + +/* see 
sph_echo.h */
+void
+sph_echo384_init(void *cc)
+{
+	echo_big_init(cc, 384);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 12);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_init(void *cc)
+{
+	echo_big_init(cc, 512);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512(void *cc, const void *data, size_t len)
+{
+	echo_big_core(cc, data, len);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_close(void *cc, void *dst)
+{
+	echo_big_close(cc, 0, 0, dst, 16);
+}
+
+/* see sph_echo.h */
+void
+sph_echo512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	echo_big_close(cc, ub, n, dst, 16);
+}
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/sph_echo.h b/sha3/sph_echo.h
new file mode 100644
index 00000000..1ae1e3dd
--- /dev/null
+++ b/sha3/sph_echo.h
@@ -0,0 +1,320 @@
+/* $Id: sph_echo.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * ECHO interface. ECHO is a family of functions which differ by
+ * their output size; this implementation defines ECHO for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_echo.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_ECHO_H__
+#define SPH_ECHO_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for ECHO-224.
+ */
+#define SPH_SIZE_echo224   224
+
+/**
+ * Output size (in bits) for ECHO-256.
+ */
+#define SPH_SIZE_echo256   256
+
+/**
+ * Output size (in bits) for ECHO-384.
+ */
+#define SPH_SIZE_echo384   384
+
+/**
+ * Output size (in bits) for ECHO-512.
+ */
+#define SPH_SIZE_echo512   512
+
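+/*
+ * Typical usage of the functions declared below (a minimal sketch;
+ * the buffer names are illustrative only). A context is initialized
+ * once, fed any number of data fragments, then closed to obtain the
+ * digest:
+ *
+ *     sph_echo512_context cc;
+ *     unsigned char digest[64];
+ *
+ *     sph_echo512_init(&cc);
+ *     sph_echo512(&cc, "abc", 3);
+ *     sph_echo512_close(&cc, digest);
+ */
+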
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-224
+ * and ECHO-256.
+ *
+ * The contents of this structure are private. A running ECHO
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[192];    /* first field, for alignment */
+	size_t ptr;
+	union {
+		sph_u32 Vs[4][4];
+#if SPH_64
+		sph_u64 Vb[4][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;
+#endif
+} sph_echo_small_context;
+
+/**
+ * This structure is a context for ECHO computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * an ECHO computation has been performed, the context can be reused for
+ * another computation. This specific structure is used for ECHO-384
+ * and ECHO-512.
+ *
+ * The contents of this structure are private. A running ECHO
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];    /* first field, for alignment */
+	size_t ptr;
+	union {
+		sph_u32 Vs[8][4];
+#if SPH_64
+		sph_u64 Vb[8][2];
+#endif
+	} u;
+	sph_u32 C0, C1, C2, C3;
+#endif
+} sph_echo_big_context;
+
+/**
+ * Type for an ECHO-224 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo224_context;
+
+/**
+ * Type for an ECHO-256 context (identical to the common "small" context).
+ */
+typedef sph_echo_small_context sph_echo256_context;
+
+/**
+ * Type for an ECHO-384 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo384_context;
+
+/**
+ * Type for an ECHO-512 context (identical to the common "big" context).
+ */
+typedef sph_echo_big_context sph_echo512_context;
+
+/**
+ * Initialize an ECHO-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-224 context (pointer to a
+ *             sph_echo224_context)
+ */
+void sph_echo224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param dst   the destination buffer
+ */
+void sph_echo224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-256 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-256 context (pointer to a
+ *             sph_echo256_context)
+ */
+void sph_echo256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo256(void *cc, const void *data, size_t len);
+
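+/*
+ * A worked example of the extra-bits convention described above (the
+ * names are illustrative): the appended bits occupy ub from the most
+ * significant position down. To append the three bits 1, 0, 1 after
+ * the last full byte, set bit 7, clear bit 6 and set bit 5 of ub,
+ * i.e. ub = 0xA0 and n = 3:
+ *
+ *     sph_echo256_addbits_and_close(&cc, 0xA0, 3, digest);
+ */
+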
+/**
+ * Terminate the current ECHO-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param dst   the destination buffer
+ */
+void sph_echo256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-384 context (pointer to a
+ *             sph_echo384_context)
+ */
+void sph_echo384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param dst   the destination buffer
+ */
+void sph_echo384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize an ECHO-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the ECHO-512 context (pointer to a
+ *             sph_echo512_context)
+ */
+void sph_echo512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the ECHO-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_echo512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current ECHO-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param dst   the destination buffer
+ */
+void sph_echo512_close(void *cc, void *dst);
+
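+/*
+ * Because each *_close() call reinitializes its context, one context
+ * object can hash a sequence of messages without further *_init()
+ * calls (sketch; names are illustrative):
+ *
+ *     sph_echo512(&cc, msg1, len1);
+ *     sph_echo512_close(&cc, digest1);
+ *     sph_echo512(&cc, msg2, len2);
+ *     sph_echo512_close(&cc, digest2);
+ */
+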
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the ECHO-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_echo512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_fugue.c b/sha3/sph_fugue.c
new file mode 100644
index 00000000..390d2d16
--- /dev/null
+++ b/sha3/sph_fugue.c
@@ -0,0 +1,1208 @@
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_fugue.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+static const sph_u32 IV224[] = {
+	SPH_C32(0xf4c9120d), SPH_C32(0x6286f757), SPH_C32(0xee39e01c),
+	SPH_C32(0xe074e3cb), SPH_C32(0xa1127c62), SPH_C32(0x9a43d215),
+	SPH_C32(0xbd8d679a)
+};
+
+static const sph_u32 IV256[] = {
+	SPH_C32(0xe952bdde), SPH_C32(0x6671135f), SPH_C32(0xe0d4f668),
+	SPH_C32(0xd2b0b594), SPH_C32(0xf96c621d), SPH_C32(0xfbf929de),
+	SPH_C32(0x9149e899), SPH_C32(0x34f8c248)
+};
+
+static const sph_u32 IV384[] = {
+	SPH_C32(0xaa61ec0d), SPH_C32(0x31252e1f), SPH_C32(0xa01db4c7),
+	SPH_C32(0x00600985), SPH_C32(0x215ef44a), SPH_C32(0x741b5e9c),
+	SPH_C32(0xfa693e9a), SPH_C32(0x473eb040), SPH_C32(0xe502ae8a),
+	SPH_C32(0xa99c25e0), SPH_C32(0xbc95517c), SPH_C32(0x5c1095a1)
+};
+
+static const sph_u32 IV512[] = {
+	SPH_C32(0x8807a57e), SPH_C32(0xe616af75), SPH_C32(0xc5d3e4db),
+	SPH_C32(0xac9ab027), SPH_C32(0xd915f117), SPH_C32(0xb6eecc54),
+	SPH_C32(0x06e8020b), SPH_C32(0x4a92efd1), SPH_C32(0xaac6e2c9),
+	SPH_C32(0xddb21398), SPH_C32(0xcae65838), SPH_C32(0x437f203f),
+	SPH_C32(0x25ea78e7), SPH_C32(0x951fddd6), SPH_C32(0xda6ed11d),
+	SPH_C32(0xe13e3567)
+};
+
+static const sph_u32 mixtab0[] = {
+	SPH_C32(0x63633297), SPH_C32(0x7c7c6feb), SPH_C32(0x77775ec7),
+	SPH_C32(0x7b7b7af7), SPH_C32(0xf2f2e8e5), SPH_C32(0x6b6b0ab7),
+	SPH_C32(0x6f6f16a7), SPH_C32(0xc5c56d39), SPH_C32(0x303090c0),
+	SPH_C32(0x01010704), SPH_C32(0x67672e87), SPH_C32(0x2b2bd1ac),
+	SPH_C32(0xfefeccd5), SPH_C32(0xd7d71371), SPH_C32(0xabab7c9a),
+	SPH_C32(0x767659c3), SPH_C32(0xcaca4005), SPH_C32(0x8282a33e),
+	SPH_C32(0xc9c94909), SPH_C32(0x7d7d68ef), SPH_C32(0xfafad0c5),
+	SPH_C32(0x5959947f), SPH_C32(0x4747ce07), SPH_C32(0xf0f0e6ed),
+	SPH_C32(0xadad6e82), SPH_C32(0xd4d41a7d), SPH_C32(0xa2a243be),
+	SPH_C32(0xafaf608a), SPH_C32(0x9c9cf946), SPH_C32(0xa4a451a6),
+	SPH_C32(0x727245d3), SPH_C32(0xc0c0762d), SPH_C32(0xb7b728ea),
+	SPH_C32(0xfdfdc5d9), SPH_C32(0x9393d47a), SPH_C32(0x2626f298),
+	SPH_C32(0x363682d8), SPH_C32(0x3f3fbdfc), SPH_C32(0xf7f7f3f1),
+	SPH_C32(0xcccc521d), SPH_C32(0x34348cd0), SPH_C32(0xa5a556a2),
+	SPH_C32(0xe5e58db9), SPH_C32(0xf1f1e1e9), SPH_C32(0x71714cdf),
+	SPH_C32(0xd8d83e4d), SPH_C32(0x313197c4), SPH_C32(0x15156b54),
+	SPH_C32(0x04041c10), SPH_C32(0xc7c76331), SPH_C32(0x2323e98c),
+	SPH_C32(0xc3c37f21), SPH_C32(0x18184860), SPH_C32(0x9696cf6e),
+	SPH_C32(0x05051b14), SPH_C32(0x9a9aeb5e), SPH_C32(0x0707151c),
+	SPH_C32(0x12127e48), SPH_C32(0x8080ad36), SPH_C32(0xe2e298a5),
+	
SPH_C32(0xebeba781), SPH_C32(0x2727f59c), SPH_C32(0xb2b233fe), + SPH_C32(0x757550cf), SPH_C32(0x09093f24), SPH_C32(0x8383a43a), + SPH_C32(0x2c2cc4b0), SPH_C32(0x1a1a4668), SPH_C32(0x1b1b416c), + SPH_C32(0x6e6e11a3), SPH_C32(0x5a5a9d73), SPH_C32(0xa0a04db6), + SPH_C32(0x5252a553), SPH_C32(0x3b3ba1ec), SPH_C32(0xd6d61475), + SPH_C32(0xb3b334fa), SPH_C32(0x2929dfa4), SPH_C32(0xe3e39fa1), + SPH_C32(0x2f2fcdbc), SPH_C32(0x8484b126), SPH_C32(0x5353a257), + SPH_C32(0xd1d10169), SPH_C32(0x00000000), SPH_C32(0xededb599), + SPH_C32(0x2020e080), SPH_C32(0xfcfcc2dd), SPH_C32(0xb1b13af2), + SPH_C32(0x5b5b9a77), SPH_C32(0x6a6a0db3), SPH_C32(0xcbcb4701), + SPH_C32(0xbebe17ce), SPH_C32(0x3939afe4), SPH_C32(0x4a4aed33), + SPH_C32(0x4c4cff2b), SPH_C32(0x5858937b), SPH_C32(0xcfcf5b11), + SPH_C32(0xd0d0066d), SPH_C32(0xefefbb91), SPH_C32(0xaaaa7b9e), + SPH_C32(0xfbfbd7c1), SPH_C32(0x4343d217), SPH_C32(0x4d4df82f), + SPH_C32(0x333399cc), SPH_C32(0x8585b622), SPH_C32(0x4545c00f), + SPH_C32(0xf9f9d9c9), SPH_C32(0x02020e08), SPH_C32(0x7f7f66e7), + SPH_C32(0x5050ab5b), SPH_C32(0x3c3cb4f0), SPH_C32(0x9f9ff04a), + SPH_C32(0xa8a87596), SPH_C32(0x5151ac5f), SPH_C32(0xa3a344ba), + SPH_C32(0x4040db1b), SPH_C32(0x8f8f800a), SPH_C32(0x9292d37e), + SPH_C32(0x9d9dfe42), SPH_C32(0x3838a8e0), SPH_C32(0xf5f5fdf9), + SPH_C32(0xbcbc19c6), SPH_C32(0xb6b62fee), SPH_C32(0xdada3045), + SPH_C32(0x2121e784), SPH_C32(0x10107040), SPH_C32(0xffffcbd1), + SPH_C32(0xf3f3efe1), SPH_C32(0xd2d20865), SPH_C32(0xcdcd5519), + SPH_C32(0x0c0c2430), SPH_C32(0x1313794c), SPH_C32(0xececb29d), + SPH_C32(0x5f5f8667), SPH_C32(0x9797c86a), SPH_C32(0x4444c70b), + SPH_C32(0x1717655c), SPH_C32(0xc4c46a3d), SPH_C32(0xa7a758aa), + SPH_C32(0x7e7e61e3), SPH_C32(0x3d3db3f4), SPH_C32(0x6464278b), + SPH_C32(0x5d5d886f), SPH_C32(0x19194f64), SPH_C32(0x737342d7), + SPH_C32(0x60603b9b), SPH_C32(0x8181aa32), SPH_C32(0x4f4ff627), + SPH_C32(0xdcdc225d), SPH_C32(0x2222ee88), SPH_C32(0x2a2ad6a8), + SPH_C32(0x9090dd76), SPH_C32(0x88889516), SPH_C32(0x4646c903), + SPH_C32(0xeeeebc95), SPH_C32(0xb8b805d6), SPH_C32(0x14146c50), + SPH_C32(0xdede2c55), SPH_C32(0x5e5e8163), SPH_C32(0x0b0b312c), + SPH_C32(0xdbdb3741), SPH_C32(0xe0e096ad), SPH_C32(0x32329ec8), + SPH_C32(0x3a3aa6e8), SPH_C32(0x0a0a3628), SPH_C32(0x4949e43f), + SPH_C32(0x06061218), SPH_C32(0x2424fc90), SPH_C32(0x5c5c8f6b), + SPH_C32(0xc2c27825), SPH_C32(0xd3d30f61), SPH_C32(0xacac6986), + SPH_C32(0x62623593), SPH_C32(0x9191da72), SPH_C32(0x9595c662), + SPH_C32(0xe4e48abd), SPH_C32(0x797974ff), SPH_C32(0xe7e783b1), + SPH_C32(0xc8c84e0d), SPH_C32(0x373785dc), SPH_C32(0x6d6d18af), + SPH_C32(0x8d8d8e02), SPH_C32(0xd5d51d79), SPH_C32(0x4e4ef123), + SPH_C32(0xa9a97292), SPH_C32(0x6c6c1fab), SPH_C32(0x5656b943), + SPH_C32(0xf4f4fafd), SPH_C32(0xeaeaa085), SPH_C32(0x6565208f), + SPH_C32(0x7a7a7df3), SPH_C32(0xaeae678e), SPH_C32(0x08083820), + SPH_C32(0xbaba0bde), SPH_C32(0x787873fb), SPH_C32(0x2525fb94), + SPH_C32(0x2e2ecab8), SPH_C32(0x1c1c5470), SPH_C32(0xa6a65fae), + SPH_C32(0xb4b421e6), SPH_C32(0xc6c66435), SPH_C32(0xe8e8ae8d), + SPH_C32(0xdddd2559), SPH_C32(0x747457cb), SPH_C32(0x1f1f5d7c), + SPH_C32(0x4b4bea37), SPH_C32(0xbdbd1ec2), SPH_C32(0x8b8b9c1a), + SPH_C32(0x8a8a9b1e), SPH_C32(0x70704bdb), SPH_C32(0x3e3ebaf8), + SPH_C32(0xb5b526e2), SPH_C32(0x66662983), SPH_C32(0x4848e33b), + SPH_C32(0x0303090c), SPH_C32(0xf6f6f4f5), SPH_C32(0x0e0e2a38), + SPH_C32(0x61613c9f), SPH_C32(0x35358bd4), SPH_C32(0x5757be47), + SPH_C32(0xb9b902d2), SPH_C32(0x8686bf2e), SPH_C32(0xc1c17129), + SPH_C32(0x1d1d5374), SPH_C32(0x9e9ef74e), 
SPH_C32(0xe1e191a9), + SPH_C32(0xf8f8decd), SPH_C32(0x9898e556), SPH_C32(0x11117744), + SPH_C32(0x696904bf), SPH_C32(0xd9d93949), SPH_C32(0x8e8e870e), + SPH_C32(0x9494c166), SPH_C32(0x9b9bec5a), SPH_C32(0x1e1e5a78), + SPH_C32(0x8787b82a), SPH_C32(0xe9e9a989), SPH_C32(0xcece5c15), + SPH_C32(0x5555b04f), SPH_C32(0x2828d8a0), SPH_C32(0xdfdf2b51), + SPH_C32(0x8c8c8906), SPH_C32(0xa1a14ab2), SPH_C32(0x89899212), + SPH_C32(0x0d0d2334), SPH_C32(0xbfbf10ca), SPH_C32(0xe6e684b5), + SPH_C32(0x4242d513), SPH_C32(0x686803bb), SPH_C32(0x4141dc1f), + SPH_C32(0x9999e252), SPH_C32(0x2d2dc3b4), SPH_C32(0x0f0f2d3c), + SPH_C32(0xb0b03df6), SPH_C32(0x5454b74b), SPH_C32(0xbbbb0cda), + SPH_C32(0x16166258) +}; + +static const sph_u32 mixtab1[] = { + SPH_C32(0x97636332), SPH_C32(0xeb7c7c6f), SPH_C32(0xc777775e), + SPH_C32(0xf77b7b7a), SPH_C32(0xe5f2f2e8), SPH_C32(0xb76b6b0a), + SPH_C32(0xa76f6f16), SPH_C32(0x39c5c56d), SPH_C32(0xc0303090), + SPH_C32(0x04010107), SPH_C32(0x8767672e), SPH_C32(0xac2b2bd1), + SPH_C32(0xd5fefecc), SPH_C32(0x71d7d713), SPH_C32(0x9aabab7c), + SPH_C32(0xc3767659), SPH_C32(0x05caca40), SPH_C32(0x3e8282a3), + SPH_C32(0x09c9c949), SPH_C32(0xef7d7d68), SPH_C32(0xc5fafad0), + SPH_C32(0x7f595994), SPH_C32(0x074747ce), SPH_C32(0xedf0f0e6), + SPH_C32(0x82adad6e), SPH_C32(0x7dd4d41a), SPH_C32(0xbea2a243), + SPH_C32(0x8aafaf60), SPH_C32(0x469c9cf9), SPH_C32(0xa6a4a451), + SPH_C32(0xd3727245), SPH_C32(0x2dc0c076), SPH_C32(0xeab7b728), + SPH_C32(0xd9fdfdc5), SPH_C32(0x7a9393d4), SPH_C32(0x982626f2), + SPH_C32(0xd8363682), SPH_C32(0xfc3f3fbd), SPH_C32(0xf1f7f7f3), + SPH_C32(0x1dcccc52), SPH_C32(0xd034348c), SPH_C32(0xa2a5a556), + SPH_C32(0xb9e5e58d), SPH_C32(0xe9f1f1e1), SPH_C32(0xdf71714c), + SPH_C32(0x4dd8d83e), SPH_C32(0xc4313197), SPH_C32(0x5415156b), + SPH_C32(0x1004041c), SPH_C32(0x31c7c763), SPH_C32(0x8c2323e9), + SPH_C32(0x21c3c37f), SPH_C32(0x60181848), SPH_C32(0x6e9696cf), + SPH_C32(0x1405051b), SPH_C32(0x5e9a9aeb), SPH_C32(0x1c070715), + SPH_C32(0x4812127e), SPH_C32(0x368080ad), SPH_C32(0xa5e2e298), + SPH_C32(0x81ebeba7), SPH_C32(0x9c2727f5), SPH_C32(0xfeb2b233), + SPH_C32(0xcf757550), SPH_C32(0x2409093f), SPH_C32(0x3a8383a4), + SPH_C32(0xb02c2cc4), SPH_C32(0x681a1a46), SPH_C32(0x6c1b1b41), + SPH_C32(0xa36e6e11), SPH_C32(0x735a5a9d), SPH_C32(0xb6a0a04d), + SPH_C32(0x535252a5), SPH_C32(0xec3b3ba1), SPH_C32(0x75d6d614), + SPH_C32(0xfab3b334), SPH_C32(0xa42929df), SPH_C32(0xa1e3e39f), + SPH_C32(0xbc2f2fcd), SPH_C32(0x268484b1), SPH_C32(0x575353a2), + SPH_C32(0x69d1d101), SPH_C32(0x00000000), SPH_C32(0x99ededb5), + SPH_C32(0x802020e0), SPH_C32(0xddfcfcc2), SPH_C32(0xf2b1b13a), + SPH_C32(0x775b5b9a), SPH_C32(0xb36a6a0d), SPH_C32(0x01cbcb47), + SPH_C32(0xcebebe17), SPH_C32(0xe43939af), SPH_C32(0x334a4aed), + SPH_C32(0x2b4c4cff), SPH_C32(0x7b585893), SPH_C32(0x11cfcf5b), + SPH_C32(0x6dd0d006), SPH_C32(0x91efefbb), SPH_C32(0x9eaaaa7b), + SPH_C32(0xc1fbfbd7), SPH_C32(0x174343d2), SPH_C32(0x2f4d4df8), + SPH_C32(0xcc333399), SPH_C32(0x228585b6), SPH_C32(0x0f4545c0), + SPH_C32(0xc9f9f9d9), SPH_C32(0x0802020e), SPH_C32(0xe77f7f66), + SPH_C32(0x5b5050ab), SPH_C32(0xf03c3cb4), SPH_C32(0x4a9f9ff0), + SPH_C32(0x96a8a875), SPH_C32(0x5f5151ac), SPH_C32(0xbaa3a344), + SPH_C32(0x1b4040db), SPH_C32(0x0a8f8f80), SPH_C32(0x7e9292d3), + SPH_C32(0x429d9dfe), SPH_C32(0xe03838a8), SPH_C32(0xf9f5f5fd), + SPH_C32(0xc6bcbc19), SPH_C32(0xeeb6b62f), SPH_C32(0x45dada30), + SPH_C32(0x842121e7), SPH_C32(0x40101070), SPH_C32(0xd1ffffcb), + SPH_C32(0xe1f3f3ef), SPH_C32(0x65d2d208), SPH_C32(0x19cdcd55), + SPH_C32(0x300c0c24), 
SPH_C32(0x4c131379), SPH_C32(0x9dececb2), + SPH_C32(0x675f5f86), SPH_C32(0x6a9797c8), SPH_C32(0x0b4444c7), + SPH_C32(0x5c171765), SPH_C32(0x3dc4c46a), SPH_C32(0xaaa7a758), + SPH_C32(0xe37e7e61), SPH_C32(0xf43d3db3), SPH_C32(0x8b646427), + SPH_C32(0x6f5d5d88), SPH_C32(0x6419194f), SPH_C32(0xd7737342), + SPH_C32(0x9b60603b), SPH_C32(0x328181aa), SPH_C32(0x274f4ff6), + SPH_C32(0x5ddcdc22), SPH_C32(0x882222ee), SPH_C32(0xa82a2ad6), + SPH_C32(0x769090dd), SPH_C32(0x16888895), SPH_C32(0x034646c9), + SPH_C32(0x95eeeebc), SPH_C32(0xd6b8b805), SPH_C32(0x5014146c), + SPH_C32(0x55dede2c), SPH_C32(0x635e5e81), SPH_C32(0x2c0b0b31), + SPH_C32(0x41dbdb37), SPH_C32(0xade0e096), SPH_C32(0xc832329e), + SPH_C32(0xe83a3aa6), SPH_C32(0x280a0a36), SPH_C32(0x3f4949e4), + SPH_C32(0x18060612), SPH_C32(0x902424fc), SPH_C32(0x6b5c5c8f), + SPH_C32(0x25c2c278), SPH_C32(0x61d3d30f), SPH_C32(0x86acac69), + SPH_C32(0x93626235), SPH_C32(0x729191da), SPH_C32(0x629595c6), + SPH_C32(0xbde4e48a), SPH_C32(0xff797974), SPH_C32(0xb1e7e783), + SPH_C32(0x0dc8c84e), SPH_C32(0xdc373785), SPH_C32(0xaf6d6d18), + SPH_C32(0x028d8d8e), SPH_C32(0x79d5d51d), SPH_C32(0x234e4ef1), + SPH_C32(0x92a9a972), SPH_C32(0xab6c6c1f), SPH_C32(0x435656b9), + SPH_C32(0xfdf4f4fa), SPH_C32(0x85eaeaa0), SPH_C32(0x8f656520), + SPH_C32(0xf37a7a7d), SPH_C32(0x8eaeae67), SPH_C32(0x20080838), + SPH_C32(0xdebaba0b), SPH_C32(0xfb787873), SPH_C32(0x942525fb), + SPH_C32(0xb82e2eca), SPH_C32(0x701c1c54), SPH_C32(0xaea6a65f), + SPH_C32(0xe6b4b421), SPH_C32(0x35c6c664), SPH_C32(0x8de8e8ae), + SPH_C32(0x59dddd25), SPH_C32(0xcb747457), SPH_C32(0x7c1f1f5d), + SPH_C32(0x374b4bea), SPH_C32(0xc2bdbd1e), SPH_C32(0x1a8b8b9c), + SPH_C32(0x1e8a8a9b), SPH_C32(0xdb70704b), SPH_C32(0xf83e3eba), + SPH_C32(0xe2b5b526), SPH_C32(0x83666629), SPH_C32(0x3b4848e3), + SPH_C32(0x0c030309), SPH_C32(0xf5f6f6f4), SPH_C32(0x380e0e2a), + SPH_C32(0x9f61613c), SPH_C32(0xd435358b), SPH_C32(0x475757be), + SPH_C32(0xd2b9b902), SPH_C32(0x2e8686bf), SPH_C32(0x29c1c171), + SPH_C32(0x741d1d53), SPH_C32(0x4e9e9ef7), SPH_C32(0xa9e1e191), + SPH_C32(0xcdf8f8de), SPH_C32(0x569898e5), SPH_C32(0x44111177), + SPH_C32(0xbf696904), SPH_C32(0x49d9d939), SPH_C32(0x0e8e8e87), + SPH_C32(0x669494c1), SPH_C32(0x5a9b9bec), SPH_C32(0x781e1e5a), + SPH_C32(0x2a8787b8), SPH_C32(0x89e9e9a9), SPH_C32(0x15cece5c), + SPH_C32(0x4f5555b0), SPH_C32(0xa02828d8), SPH_C32(0x51dfdf2b), + SPH_C32(0x068c8c89), SPH_C32(0xb2a1a14a), SPH_C32(0x12898992), + SPH_C32(0x340d0d23), SPH_C32(0xcabfbf10), SPH_C32(0xb5e6e684), + SPH_C32(0x134242d5), SPH_C32(0xbb686803), SPH_C32(0x1f4141dc), + SPH_C32(0x529999e2), SPH_C32(0xb42d2dc3), SPH_C32(0x3c0f0f2d), + SPH_C32(0xf6b0b03d), SPH_C32(0x4b5454b7), SPH_C32(0xdabbbb0c), + SPH_C32(0x58161662) +}; + +static const sph_u32 mixtab2[] = { + SPH_C32(0x32976363), SPH_C32(0x6feb7c7c), SPH_C32(0x5ec77777), + SPH_C32(0x7af77b7b), SPH_C32(0xe8e5f2f2), SPH_C32(0x0ab76b6b), + SPH_C32(0x16a76f6f), SPH_C32(0x6d39c5c5), SPH_C32(0x90c03030), + SPH_C32(0x07040101), SPH_C32(0x2e876767), SPH_C32(0xd1ac2b2b), + SPH_C32(0xccd5fefe), SPH_C32(0x1371d7d7), SPH_C32(0x7c9aabab), + SPH_C32(0x59c37676), SPH_C32(0x4005caca), SPH_C32(0xa33e8282), + SPH_C32(0x4909c9c9), SPH_C32(0x68ef7d7d), SPH_C32(0xd0c5fafa), + SPH_C32(0x947f5959), SPH_C32(0xce074747), SPH_C32(0xe6edf0f0), + SPH_C32(0x6e82adad), SPH_C32(0x1a7dd4d4), SPH_C32(0x43bea2a2), + SPH_C32(0x608aafaf), SPH_C32(0xf9469c9c), SPH_C32(0x51a6a4a4), + SPH_C32(0x45d37272), SPH_C32(0x762dc0c0), SPH_C32(0x28eab7b7), + SPH_C32(0xc5d9fdfd), SPH_C32(0xd47a9393), SPH_C32(0xf2982626), + 
SPH_C32(0x82d83636), SPH_C32(0xbdfc3f3f), SPH_C32(0xf3f1f7f7), + SPH_C32(0x521dcccc), SPH_C32(0x8cd03434), SPH_C32(0x56a2a5a5), + SPH_C32(0x8db9e5e5), SPH_C32(0xe1e9f1f1), SPH_C32(0x4cdf7171), + SPH_C32(0x3e4dd8d8), SPH_C32(0x97c43131), SPH_C32(0x6b541515), + SPH_C32(0x1c100404), SPH_C32(0x6331c7c7), SPH_C32(0xe98c2323), + SPH_C32(0x7f21c3c3), SPH_C32(0x48601818), SPH_C32(0xcf6e9696), + SPH_C32(0x1b140505), SPH_C32(0xeb5e9a9a), SPH_C32(0x151c0707), + SPH_C32(0x7e481212), SPH_C32(0xad368080), SPH_C32(0x98a5e2e2), + SPH_C32(0xa781ebeb), SPH_C32(0xf59c2727), SPH_C32(0x33feb2b2), + SPH_C32(0x50cf7575), SPH_C32(0x3f240909), SPH_C32(0xa43a8383), + SPH_C32(0xc4b02c2c), SPH_C32(0x46681a1a), SPH_C32(0x416c1b1b), + SPH_C32(0x11a36e6e), SPH_C32(0x9d735a5a), SPH_C32(0x4db6a0a0), + SPH_C32(0xa5535252), SPH_C32(0xa1ec3b3b), SPH_C32(0x1475d6d6), + SPH_C32(0x34fab3b3), SPH_C32(0xdfa42929), SPH_C32(0x9fa1e3e3), + SPH_C32(0xcdbc2f2f), SPH_C32(0xb1268484), SPH_C32(0xa2575353), + SPH_C32(0x0169d1d1), SPH_C32(0x00000000), SPH_C32(0xb599eded), + SPH_C32(0xe0802020), SPH_C32(0xc2ddfcfc), SPH_C32(0x3af2b1b1), + SPH_C32(0x9a775b5b), SPH_C32(0x0db36a6a), SPH_C32(0x4701cbcb), + SPH_C32(0x17cebebe), SPH_C32(0xafe43939), SPH_C32(0xed334a4a), + SPH_C32(0xff2b4c4c), SPH_C32(0x937b5858), SPH_C32(0x5b11cfcf), + SPH_C32(0x066dd0d0), SPH_C32(0xbb91efef), SPH_C32(0x7b9eaaaa), + SPH_C32(0xd7c1fbfb), SPH_C32(0xd2174343), SPH_C32(0xf82f4d4d), + SPH_C32(0x99cc3333), SPH_C32(0xb6228585), SPH_C32(0xc00f4545), + SPH_C32(0xd9c9f9f9), SPH_C32(0x0e080202), SPH_C32(0x66e77f7f), + SPH_C32(0xab5b5050), SPH_C32(0xb4f03c3c), SPH_C32(0xf04a9f9f), + SPH_C32(0x7596a8a8), SPH_C32(0xac5f5151), SPH_C32(0x44baa3a3), + SPH_C32(0xdb1b4040), SPH_C32(0x800a8f8f), SPH_C32(0xd37e9292), + SPH_C32(0xfe429d9d), SPH_C32(0xa8e03838), SPH_C32(0xfdf9f5f5), + SPH_C32(0x19c6bcbc), SPH_C32(0x2feeb6b6), SPH_C32(0x3045dada), + SPH_C32(0xe7842121), SPH_C32(0x70401010), SPH_C32(0xcbd1ffff), + SPH_C32(0xefe1f3f3), SPH_C32(0x0865d2d2), SPH_C32(0x5519cdcd), + SPH_C32(0x24300c0c), SPH_C32(0x794c1313), SPH_C32(0xb29decec), + SPH_C32(0x86675f5f), SPH_C32(0xc86a9797), SPH_C32(0xc70b4444), + SPH_C32(0x655c1717), SPH_C32(0x6a3dc4c4), SPH_C32(0x58aaa7a7), + SPH_C32(0x61e37e7e), SPH_C32(0xb3f43d3d), SPH_C32(0x278b6464), + SPH_C32(0x886f5d5d), SPH_C32(0x4f641919), SPH_C32(0x42d77373), + SPH_C32(0x3b9b6060), SPH_C32(0xaa328181), SPH_C32(0xf6274f4f), + SPH_C32(0x225ddcdc), SPH_C32(0xee882222), SPH_C32(0xd6a82a2a), + SPH_C32(0xdd769090), SPH_C32(0x95168888), SPH_C32(0xc9034646), + SPH_C32(0xbc95eeee), SPH_C32(0x05d6b8b8), SPH_C32(0x6c501414), + SPH_C32(0x2c55dede), SPH_C32(0x81635e5e), SPH_C32(0x312c0b0b), + SPH_C32(0x3741dbdb), SPH_C32(0x96ade0e0), SPH_C32(0x9ec83232), + SPH_C32(0xa6e83a3a), SPH_C32(0x36280a0a), SPH_C32(0xe43f4949), + SPH_C32(0x12180606), SPH_C32(0xfc902424), SPH_C32(0x8f6b5c5c), + SPH_C32(0x7825c2c2), SPH_C32(0x0f61d3d3), SPH_C32(0x6986acac), + SPH_C32(0x35936262), SPH_C32(0xda729191), SPH_C32(0xc6629595), + SPH_C32(0x8abde4e4), SPH_C32(0x74ff7979), SPH_C32(0x83b1e7e7), + SPH_C32(0x4e0dc8c8), SPH_C32(0x85dc3737), SPH_C32(0x18af6d6d), + SPH_C32(0x8e028d8d), SPH_C32(0x1d79d5d5), SPH_C32(0xf1234e4e), + SPH_C32(0x7292a9a9), SPH_C32(0x1fab6c6c), SPH_C32(0xb9435656), + SPH_C32(0xfafdf4f4), SPH_C32(0xa085eaea), SPH_C32(0x208f6565), + SPH_C32(0x7df37a7a), SPH_C32(0x678eaeae), SPH_C32(0x38200808), + SPH_C32(0x0bdebaba), SPH_C32(0x73fb7878), SPH_C32(0xfb942525), + SPH_C32(0xcab82e2e), SPH_C32(0x54701c1c), SPH_C32(0x5faea6a6), + SPH_C32(0x21e6b4b4), SPH_C32(0x6435c6c6), 
SPH_C32(0xae8de8e8), + SPH_C32(0x2559dddd), SPH_C32(0x57cb7474), SPH_C32(0x5d7c1f1f), + SPH_C32(0xea374b4b), SPH_C32(0x1ec2bdbd), SPH_C32(0x9c1a8b8b), + SPH_C32(0x9b1e8a8a), SPH_C32(0x4bdb7070), SPH_C32(0xbaf83e3e), + SPH_C32(0x26e2b5b5), SPH_C32(0x29836666), SPH_C32(0xe33b4848), + SPH_C32(0x090c0303), SPH_C32(0xf4f5f6f6), SPH_C32(0x2a380e0e), + SPH_C32(0x3c9f6161), SPH_C32(0x8bd43535), SPH_C32(0xbe475757), + SPH_C32(0x02d2b9b9), SPH_C32(0xbf2e8686), SPH_C32(0x7129c1c1), + SPH_C32(0x53741d1d), SPH_C32(0xf74e9e9e), SPH_C32(0x91a9e1e1), + SPH_C32(0xdecdf8f8), SPH_C32(0xe5569898), SPH_C32(0x77441111), + SPH_C32(0x04bf6969), SPH_C32(0x3949d9d9), SPH_C32(0x870e8e8e), + SPH_C32(0xc1669494), SPH_C32(0xec5a9b9b), SPH_C32(0x5a781e1e), + SPH_C32(0xb82a8787), SPH_C32(0xa989e9e9), SPH_C32(0x5c15cece), + SPH_C32(0xb04f5555), SPH_C32(0xd8a02828), SPH_C32(0x2b51dfdf), + SPH_C32(0x89068c8c), SPH_C32(0x4ab2a1a1), SPH_C32(0x92128989), + SPH_C32(0x23340d0d), SPH_C32(0x10cabfbf), SPH_C32(0x84b5e6e6), + SPH_C32(0xd5134242), SPH_C32(0x03bb6868), SPH_C32(0xdc1f4141), + SPH_C32(0xe2529999), SPH_C32(0xc3b42d2d), SPH_C32(0x2d3c0f0f), + SPH_C32(0x3df6b0b0), SPH_C32(0xb74b5454), SPH_C32(0x0cdabbbb), + SPH_C32(0x62581616) +}; + +static const sph_u32 mixtab3[] = { + SPH_C32(0x63329763), SPH_C32(0x7c6feb7c), SPH_C32(0x775ec777), + SPH_C32(0x7b7af77b), SPH_C32(0xf2e8e5f2), SPH_C32(0x6b0ab76b), + SPH_C32(0x6f16a76f), SPH_C32(0xc56d39c5), SPH_C32(0x3090c030), + SPH_C32(0x01070401), SPH_C32(0x672e8767), SPH_C32(0x2bd1ac2b), + SPH_C32(0xfeccd5fe), SPH_C32(0xd71371d7), SPH_C32(0xab7c9aab), + SPH_C32(0x7659c376), SPH_C32(0xca4005ca), SPH_C32(0x82a33e82), + SPH_C32(0xc94909c9), SPH_C32(0x7d68ef7d), SPH_C32(0xfad0c5fa), + SPH_C32(0x59947f59), SPH_C32(0x47ce0747), SPH_C32(0xf0e6edf0), + SPH_C32(0xad6e82ad), SPH_C32(0xd41a7dd4), SPH_C32(0xa243bea2), + SPH_C32(0xaf608aaf), SPH_C32(0x9cf9469c), SPH_C32(0xa451a6a4), + SPH_C32(0x7245d372), SPH_C32(0xc0762dc0), SPH_C32(0xb728eab7), + SPH_C32(0xfdc5d9fd), SPH_C32(0x93d47a93), SPH_C32(0x26f29826), + SPH_C32(0x3682d836), SPH_C32(0x3fbdfc3f), SPH_C32(0xf7f3f1f7), + SPH_C32(0xcc521dcc), SPH_C32(0x348cd034), SPH_C32(0xa556a2a5), + SPH_C32(0xe58db9e5), SPH_C32(0xf1e1e9f1), SPH_C32(0x714cdf71), + SPH_C32(0xd83e4dd8), SPH_C32(0x3197c431), SPH_C32(0x156b5415), + SPH_C32(0x041c1004), SPH_C32(0xc76331c7), SPH_C32(0x23e98c23), + SPH_C32(0xc37f21c3), SPH_C32(0x18486018), SPH_C32(0x96cf6e96), + SPH_C32(0x051b1405), SPH_C32(0x9aeb5e9a), SPH_C32(0x07151c07), + SPH_C32(0x127e4812), SPH_C32(0x80ad3680), SPH_C32(0xe298a5e2), + SPH_C32(0xeba781eb), SPH_C32(0x27f59c27), SPH_C32(0xb233feb2), + SPH_C32(0x7550cf75), SPH_C32(0x093f2409), SPH_C32(0x83a43a83), + SPH_C32(0x2cc4b02c), SPH_C32(0x1a46681a), SPH_C32(0x1b416c1b), + SPH_C32(0x6e11a36e), SPH_C32(0x5a9d735a), SPH_C32(0xa04db6a0), + SPH_C32(0x52a55352), SPH_C32(0x3ba1ec3b), SPH_C32(0xd61475d6), + SPH_C32(0xb334fab3), SPH_C32(0x29dfa429), SPH_C32(0xe39fa1e3), + SPH_C32(0x2fcdbc2f), SPH_C32(0x84b12684), SPH_C32(0x53a25753), + SPH_C32(0xd10169d1), SPH_C32(0x00000000), SPH_C32(0xedb599ed), + SPH_C32(0x20e08020), SPH_C32(0xfcc2ddfc), SPH_C32(0xb13af2b1), + SPH_C32(0x5b9a775b), SPH_C32(0x6a0db36a), SPH_C32(0xcb4701cb), + SPH_C32(0xbe17cebe), SPH_C32(0x39afe439), SPH_C32(0x4aed334a), + SPH_C32(0x4cff2b4c), SPH_C32(0x58937b58), SPH_C32(0xcf5b11cf), + SPH_C32(0xd0066dd0), SPH_C32(0xefbb91ef), SPH_C32(0xaa7b9eaa), + SPH_C32(0xfbd7c1fb), SPH_C32(0x43d21743), SPH_C32(0x4df82f4d), + SPH_C32(0x3399cc33), SPH_C32(0x85b62285), SPH_C32(0x45c00f45), + SPH_C32(0xf9d9c9f9), 
SPH_C32(0x020e0802), SPH_C32(0x7f66e77f), + SPH_C32(0x50ab5b50), SPH_C32(0x3cb4f03c), SPH_C32(0x9ff04a9f), + SPH_C32(0xa87596a8), SPH_C32(0x51ac5f51), SPH_C32(0xa344baa3), + SPH_C32(0x40db1b40), SPH_C32(0x8f800a8f), SPH_C32(0x92d37e92), + SPH_C32(0x9dfe429d), SPH_C32(0x38a8e038), SPH_C32(0xf5fdf9f5), + SPH_C32(0xbc19c6bc), SPH_C32(0xb62feeb6), SPH_C32(0xda3045da), + SPH_C32(0x21e78421), SPH_C32(0x10704010), SPH_C32(0xffcbd1ff), + SPH_C32(0xf3efe1f3), SPH_C32(0xd20865d2), SPH_C32(0xcd5519cd), + SPH_C32(0x0c24300c), SPH_C32(0x13794c13), SPH_C32(0xecb29dec), + SPH_C32(0x5f86675f), SPH_C32(0x97c86a97), SPH_C32(0x44c70b44), + SPH_C32(0x17655c17), SPH_C32(0xc46a3dc4), SPH_C32(0xa758aaa7), + SPH_C32(0x7e61e37e), SPH_C32(0x3db3f43d), SPH_C32(0x64278b64), + SPH_C32(0x5d886f5d), SPH_C32(0x194f6419), SPH_C32(0x7342d773), + SPH_C32(0x603b9b60), SPH_C32(0x81aa3281), SPH_C32(0x4ff6274f), + SPH_C32(0xdc225ddc), SPH_C32(0x22ee8822), SPH_C32(0x2ad6a82a), + SPH_C32(0x90dd7690), SPH_C32(0x88951688), SPH_C32(0x46c90346), + SPH_C32(0xeebc95ee), SPH_C32(0xb805d6b8), SPH_C32(0x146c5014), + SPH_C32(0xde2c55de), SPH_C32(0x5e81635e), SPH_C32(0x0b312c0b), + SPH_C32(0xdb3741db), SPH_C32(0xe096ade0), SPH_C32(0x329ec832), + SPH_C32(0x3aa6e83a), SPH_C32(0x0a36280a), SPH_C32(0x49e43f49), + SPH_C32(0x06121806), SPH_C32(0x24fc9024), SPH_C32(0x5c8f6b5c), + SPH_C32(0xc27825c2), SPH_C32(0xd30f61d3), SPH_C32(0xac6986ac), + SPH_C32(0x62359362), SPH_C32(0x91da7291), SPH_C32(0x95c66295), + SPH_C32(0xe48abde4), SPH_C32(0x7974ff79), SPH_C32(0xe783b1e7), + SPH_C32(0xc84e0dc8), SPH_C32(0x3785dc37), SPH_C32(0x6d18af6d), + SPH_C32(0x8d8e028d), SPH_C32(0xd51d79d5), SPH_C32(0x4ef1234e), + SPH_C32(0xa97292a9), SPH_C32(0x6c1fab6c), SPH_C32(0x56b94356), + SPH_C32(0xf4fafdf4), SPH_C32(0xeaa085ea), SPH_C32(0x65208f65), + SPH_C32(0x7a7df37a), SPH_C32(0xae678eae), SPH_C32(0x08382008), + SPH_C32(0xba0bdeba), SPH_C32(0x7873fb78), SPH_C32(0x25fb9425), + SPH_C32(0x2ecab82e), SPH_C32(0x1c54701c), SPH_C32(0xa65faea6), + SPH_C32(0xb421e6b4), SPH_C32(0xc66435c6), SPH_C32(0xe8ae8de8), + SPH_C32(0xdd2559dd), SPH_C32(0x7457cb74), SPH_C32(0x1f5d7c1f), + SPH_C32(0x4bea374b), SPH_C32(0xbd1ec2bd), SPH_C32(0x8b9c1a8b), + SPH_C32(0x8a9b1e8a), SPH_C32(0x704bdb70), SPH_C32(0x3ebaf83e), + SPH_C32(0xb526e2b5), SPH_C32(0x66298366), SPH_C32(0x48e33b48), + SPH_C32(0x03090c03), SPH_C32(0xf6f4f5f6), SPH_C32(0x0e2a380e), + SPH_C32(0x613c9f61), SPH_C32(0x358bd435), SPH_C32(0x57be4757), + SPH_C32(0xb902d2b9), SPH_C32(0x86bf2e86), SPH_C32(0xc17129c1), + SPH_C32(0x1d53741d), SPH_C32(0x9ef74e9e), SPH_C32(0xe191a9e1), + SPH_C32(0xf8decdf8), SPH_C32(0x98e55698), SPH_C32(0x11774411), + SPH_C32(0x6904bf69), SPH_C32(0xd93949d9), SPH_C32(0x8e870e8e), + SPH_C32(0x94c16694), SPH_C32(0x9bec5a9b), SPH_C32(0x1e5a781e), + SPH_C32(0x87b82a87), SPH_C32(0xe9a989e9), SPH_C32(0xce5c15ce), + SPH_C32(0x55b04f55), SPH_C32(0x28d8a028), SPH_C32(0xdf2b51df), + SPH_C32(0x8c89068c), SPH_C32(0xa14ab2a1), SPH_C32(0x89921289), + SPH_C32(0x0d23340d), SPH_C32(0xbf10cabf), SPH_C32(0xe684b5e6), + SPH_C32(0x42d51342), SPH_C32(0x6803bb68), SPH_C32(0x41dc1f41), + SPH_C32(0x99e25299), SPH_C32(0x2dc3b42d), SPH_C32(0x0f2d3c0f), + SPH_C32(0xb03df6b0), SPH_C32(0x54b74b54), SPH_C32(0xbb0cdabb), + SPH_C32(0x16625816) +}; + +#define TIX2(q, x00, x01, x08, x10, x24) do { \ + x10 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + } while (0) + +#define TIX3(q, x00, x01, x04, x08, x16, x27, x30) do { \ + x16 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x27; \ + x04 ^= x30; \ + } while (0) + +#define TIX4(q, 
x00, x01, x04, x07, x08, x22, x24, x27, x30) do { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } while (0) + +#define CMIX30(x00, x01, x02, x04, x05, x06, x15, x16, x17) do { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x15 ^= x04; \ + x16 ^= x05; \ + x17 ^= x06; \ + } while (0) + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) do { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } while (0) + +#define SMIX(x0, x1, x2, x3) do { \ + sph_u32 c0 = 0; \ + sph_u32 c1 = 0; \ + sph_u32 c2 = 0; \ + sph_u32 c3 = 0; \ + sph_u32 r0 = 0; \ + sph_u32 r1 = 0; \ + sph_u32 r2 = 0; \ + sph_u32 r3 = 0; \ + sph_u32 tmp; \ + tmp = mixtab0[x0 >> 24]; \ + c0 ^= tmp; \ + tmp = mixtab1[(x0 >> 16) & 0xFF]; \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x0 >> 8) & 0xFF]; \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x0 & 0xFF]; \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x1 >> 24]; \ + c1 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x1 >> 16) & 0xFF]; \ + c1 ^= tmp; \ + tmp = mixtab2[(x1 >> 8) & 0xFF]; \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x1 & 0xFF]; \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x2 >> 24]; \ + c2 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x2 >> 16) & 0xFF]; \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x2 >> 8) & 0xFF]; \ + c2 ^= tmp; \ + tmp = mixtab3[x2 & 0xFF]; \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0[x3 >> 24]; \ + c3 ^= tmp; \ + r0 ^= tmp; \ + tmp = mixtab1[(x3 >> 16) & 0xFF]; \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2[(x3 >> 8) & 0xFF]; \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3[x3 & 0xFF]; \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & SPH_C32(0xFF000000)) \ + | ((c1 ^ r1) & SPH_C32(0x00FF0000)) \ + | ((c2 ^ r2) & SPH_C32(0x0000FF00)) \ + | ((c3 ^ r3) & SPH_C32(0x000000FF)); \ + x1 = ((c1 ^ (r0 << 8)) & SPH_C32(0xFF000000)) \ + | ((c2 ^ (r1 << 8)) & SPH_C32(0x00FF0000)) \ + | ((c3 ^ (r2 << 8)) & SPH_C32(0x0000FF00)) \ + | ((c0 ^ (r3 >> 24)) & SPH_C32(0x000000FF)); \ + x2 = ((c2 ^ (r0 << 16)) & SPH_C32(0xFF000000)) \ + | ((c3 ^ (r1 << 16)) & SPH_C32(0x00FF0000)) \ + | ((c0 ^ (r2 >> 16)) & SPH_C32(0x0000FF00)) \ + | ((c1 ^ (r3 >> 16)) & SPH_C32(0x000000FF)); \ + x3 = ((c3 ^ (r0 << 24)) & SPH_C32(0xFF000000)) \ + | ((c0 ^ (r1 >> 8)) & SPH_C32(0x00FF0000)) \ + | ((c1 ^ (r2 >> 8)) & SPH_C32(0x0000FF00)) \ + | ((c2 ^ (r3 >> 8)) & SPH_C32(0x000000FF)); \ + /* */ \ + } while (0) + +#if SPH_FUGUE_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(state) +#define WRITE_STATE_SMALL(state) +#define DECL_STATE_BIG +#define READ_STATE_BIG(state) +#define WRITE_STATE_BIG(state) + +#define S00 ((sc)->S[ 0]) +#define S01 ((sc)->S[ 1]) +#define S02 ((sc)->S[ 2]) +#define S03 ((sc)->S[ 3]) +#define S04 ((sc)->S[ 4]) +#define S05 ((sc)->S[ 5]) +#define S06 ((sc)->S[ 6]) +#define S07 ((sc)->S[ 7]) +#define S08 ((sc)->S[ 8]) +#define S09 ((sc)->S[ 9]) +#define S10 ((sc)->S[10]) +#define S11 ((sc)->S[11]) +#define S12 ((sc)->S[12]) +#define S13 ((sc)->S[13]) +#define S14 ((sc)->S[14]) +#define S15 ((sc)->S[15]) +#define S16 ((sc)->S[16]) +#define S17 ((sc)->S[17]) +#define S18 ((sc)->S[18]) +#define S19 ((sc)->S[19]) +#define S20 ((sc)->S[20]) +#define S21 ((sc)->S[21]) +#define S22 ((sc)->S[22]) +#define S23 ((sc)->S[23]) +#define S24 ((sc)->S[24]) +#define S25 ((sc)->S[25]) +#define S26 ((sc)->S[26]) +#define S27 ((sc)->S[27]) +#define S28 ((sc)->S[28]) +#define S29 ((sc)->S[29]) +#define S30 ((sc)->S[30]) +#define S31 ((sc)->S[31]) +#define S32 
((sc)->S[32]) +#define S33 ((sc)->S[33]) +#define S34 ((sc)->S[34]) +#define S35 ((sc)->S[35]) + +#else + +#define DECL_STATE_SMALL \ + sph_u32 S00, S01, S02, S03, S04, S05, S06, S07, S08, S09; \ + sph_u32 S10, S11, S12, S13, S14, S15, S16, S17, S18, S19; \ + sph_u32 S20, S21, S22, S23, S24, S25, S26, S27, S28, S29; + +#define DECL_STATE_BIG \ + DECL_STATE_SMALL \ + sph_u32 S30, S31, S32, S33, S34, S35; + +#define READ_STATE_SMALL(state) do { \ + S00 = (state)->S[ 0]; \ + S01 = (state)->S[ 1]; \ + S02 = (state)->S[ 2]; \ + S03 = (state)->S[ 3]; \ + S04 = (state)->S[ 4]; \ + S05 = (state)->S[ 5]; \ + S06 = (state)->S[ 6]; \ + S07 = (state)->S[ 7]; \ + S08 = (state)->S[ 8]; \ + S09 = (state)->S[ 9]; \ + S10 = (state)->S[10]; \ + S11 = (state)->S[11]; \ + S12 = (state)->S[12]; \ + S13 = (state)->S[13]; \ + S14 = (state)->S[14]; \ + S15 = (state)->S[15]; \ + S16 = (state)->S[16]; \ + S17 = (state)->S[17]; \ + S18 = (state)->S[18]; \ + S19 = (state)->S[19]; \ + S20 = (state)->S[20]; \ + S21 = (state)->S[21]; \ + S22 = (state)->S[22]; \ + S23 = (state)->S[23]; \ + S24 = (state)->S[24]; \ + S25 = (state)->S[25]; \ + S26 = (state)->S[26]; \ + S27 = (state)->S[27]; \ + S28 = (state)->S[28]; \ + S29 = (state)->S[29]; \ + } while (0) + +#define READ_STATE_BIG(state) do { \ + READ_STATE_SMALL(state); \ + S30 = (state)->S[30]; \ + S31 = (state)->S[31]; \ + S32 = (state)->S[32]; \ + S33 = (state)->S[33]; \ + S34 = (state)->S[34]; \ + S35 = (state)->S[35]; \ + } while (0) + +#define WRITE_STATE_SMALL(state) do { \ + (state)->S[ 0] = S00; \ + (state)->S[ 1] = S01; \ + (state)->S[ 2] = S02; \ + (state)->S[ 3] = S03; \ + (state)->S[ 4] = S04; \ + (state)->S[ 5] = S05; \ + (state)->S[ 6] = S06; \ + (state)->S[ 7] = S07; \ + (state)->S[ 8] = S08; \ + (state)->S[ 9] = S09; \ + (state)->S[10] = S10; \ + (state)->S[11] = S11; \ + (state)->S[12] = S12; \ + (state)->S[13] = S13; \ + (state)->S[14] = S14; \ + (state)->S[15] = S15; \ + (state)->S[16] = S16; \ + (state)->S[17] = S17; \ + (state)->S[18] = S18; \ + (state)->S[19] = S19; \ + (state)->S[20] = S20; \ + (state)->S[21] = S21; \ + (state)->S[22] = S22; \ + (state)->S[23] = S23; \ + (state)->S[24] = S24; \ + (state)->S[25] = S25; \ + (state)->S[26] = S26; \ + (state)->S[27] = S27; \ + (state)->S[28] = S28; \ + (state)->S[29] = S29; \ + } while (0) + +#define WRITE_STATE_BIG(state) do { \ + WRITE_STATE_SMALL(state); \ + (state)->S[30] = S30; \ + (state)->S[31] = S31; \ + (state)->S[32] = S32; \ + (state)->S[33] = S33; \ + (state)->S[34] = S34; \ + (state)->S[35] = S35; \ + } while (0) + +#endif + +static void +fugue_init(sph_fugue_context *sc, size_t z_len, + const sph_u32 *iv, size_t iv_len) +{ + size_t u; + + for (u = 0; u < z_len; u ++) + sc->S[u] = 0; + memcpy(&sc->S[z_len], iv, iv_len * sizeof *iv); + sc->partial = 0; + sc->partial_len = 0; + sc->round_shift = 0; +#if SPH_64 + sc->bit_count = 0; +#else + sc->bit_count_high = 0; + sc->bit_count_low = 0; +#endif +} + +#if SPH_64 + +#define INCR_COUNTER do { \ + sc->bit_count += (sph_u64)len << 3; \ + } while (0) + +#else + +#define INCR_COUNTER do { \ + sph_u32 tmp = SPH_T32((sph_u32)len << 3); \ + sc->bit_count_low = SPH_T32(sc->bit_count_low + tmp); \ + if (sc->bit_count_low < tmp) \ + sc->bit_count_high ++; \ + sc->bit_count_high = SPH_T32(sc->bit_count_high \ + + ((sph_u32)len >> 29)); \ + } while (0) + +#endif + +#define CORE_ENTRY \ + sph_u32 p; \ + unsigned plen, rshift; \ + INCR_COUNTER; \ + p = sc->partial; \ + plen = sc->partial_len; \ + if (plen < 4) { \ + unsigned count = 4 - plen; \ + if (len < 
count) \ + count = len; \ + plen += count; \ + while (count -- > 0) { \ + p = (p << 8) | *(const unsigned char *)data; \ + data = (const unsigned char *)data + 1; \ + len --; \ + } \ + if (len == 0) { \ + sc->partial = p; \ + sc->partial_len = plen; \ + return; \ + } \ + } + +#define CORE_EXIT \ + p = 0; \ + sc->partial_len = (unsigned)len; \ + while (len -- > 0) { \ + p = (p << 8) | *(const unsigned char *)data; \ + data = (const unsigned char *)data + 1; \ + } \ + sc->partial = p; \ + sc->round_shift = rshift; + +/* + * Not in a do..while: the 'break' must exit the outer loop. + */ +#define NEXT(rc) \ + if (len <= 4) { \ + rshift = (rc); \ + break; \ + } \ + p = sph_dec32be(data); \ + data = (const unsigned char *)data + 4; \ + len -= 4 + +static void +fugue2_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_SMALL + CORE_ENTRY + READ_STATE_SMALL(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX2(q, S00, S01, S08, S10, S24); + CMIX30(S27, S28, S29, S01, S02, S03, S12, S13, S14); + SMIX(S27, S28, S29, S00); + CMIX30(S24, S25, S26, S28, S29, S00, S09, S10, S11); + SMIX(S24, S25, S26, S27); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX2(q, S24, S25, S02, S04, S18); + CMIX30(S21, S22, S23, S25, S26, S27, S06, S07, S08); + SMIX(S21, S22, S23, S24); + CMIX30(S18, S19, S20, S22, S23, S24, S03, S04, S05); + SMIX(S18, S19, S20, S21); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX2(q, S18, S19, S26, S28, S12); + CMIX30(S15, S16, S17, S19, S20, S21, S00, S01, S02); + SMIX(S15, S16, S17, S18); + CMIX30(S12, S13, S14, S16, S17, S18, S27, S28, S29); + SMIX(S12, S13, S14, S15); + NEXT(3); + /* fall through */ + case 3: + q = p; + TIX2(q, S12, S13, S20, S22, S06); + CMIX30(S09, S10, S11, S13, S14, S15, S24, S25, S26); + SMIX(S09, S10, S11, S12); + CMIX30(S06, S07, S08, S10, S11, S12, S21, S22, S23); + SMIX(S06, S07, S08, S09); + NEXT(4); + /* fall through */ + case 4: + q = p; + TIX2(q, S06, S07, S14, S16, S00); + CMIX30(S03, S04, S05, S07, S08, S09, S18, S19, S20); + SMIX(S03, S04, S05, S06); + CMIX30(S00, S01, S02, S04, S05, S06, S15, S16, S17); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_SMALL(sc); +} + +static void +fugue3_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_BIG + CORE_ENTRY + READ_STATE_BIG(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX3(q, S00, S01, S04, S08, S16, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX3(q, S27, S28, S31, S35, S07, S18, S21); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX3(q, S18, S19, S22, S26, S34, S09, S12); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + NEXT(3); + /* fall through */ + case 3: + q = p; + TIX3(q, S09, S10, S13, 
S17, S25, S00, S03); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_BIG(sc); +} + +static void +fugue4_core(sph_fugue_context *sc, const void *data, size_t len) +{ + DECL_STATE_BIG + CORE_ENTRY + READ_STATE_BIG(sc); + rshift = sc->round_shift; + switch (rshift) { + for (;;) { + sph_u32 q; + + case 0: + q = p; + TIX4(q, S00, S01, S04, S07, S08, S22, S24, S27, S30); + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); + SMIX(S33, S34, S35, S00); + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); + SMIX(S30, S31, S32, S33); + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); + SMIX(S27, S28, S29, S30); + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); + SMIX(S24, S25, S26, S27); + NEXT(1); + /* fall through */ + case 1: + q = p; + TIX4(q, S24, S25, S28, S31, S32, S10, S12, S15, S18); + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); + SMIX(S21, S22, S23, S24); + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); + SMIX(S18, S19, S20, S21); + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); + SMIX(S15, S16, S17, S18); + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); + SMIX(S12, S13, S14, S15); + NEXT(2); + /* fall through */ + case 2: + q = p; + TIX4(q, S12, S13, S16, S19, S20, S34, S00, S03, S06); + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); + SMIX(S09, S10, S11, S12); + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); + SMIX(S06, S07, S08, S09); + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); + SMIX(S03, S04, S05, S06); + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + NEXT(0); + } + } + CORE_EXIT + WRITE_STATE_BIG(sc); +} + +#if SPH_64 + +#define WRITE_COUNTER do { \ + sph_enc64be(buf + 4, sc->bit_count + n); \ + } while (0) + +#else + +#define WRITE_COUNTER do { \ + sph_enc32be(buf + 4, sc->bit_count_high); \ + sph_enc32be(buf + 8, sc->bit_count_low + n); \ + } while (0) + +#endif + +#define CLOSE_ENTRY(s, rcm, core) \ + unsigned char buf[16]; \ + unsigned plen, rms; \ + unsigned char *out; \ + sph_u32 S[s]; \ + plen = sc->partial_len; \ + WRITE_COUNTER; \ + if (plen == 0 && n == 0) { \ + plen = 4; \ + } else if (plen < 4 || n != 0) { \ + unsigned u; \ + \ + if (plen == 4) \ + plen = 0; \ + buf[plen] = ub & ~(0xFFU >> n); \ + for (u = plen + 1; u < 4; u ++) \ + buf[u] = 0; \ + } \ + core(sc, buf + plen, (sizeof buf) - plen); \ + rms = sc->round_shift * (rcm); \ + memcpy(S, sc->S + (s) - rms, rms * sizeof(sph_u32)); \ + memcpy(S + rms, sc->S, ((s) - rms) * sizeof(sph_u32)); + +#define ROR(n, s) do { \ + sph_u32 tmp[n]; \ + memcpy(tmp, S + ((s) - (n)), (n) * sizeof(sph_u32)); \ + memmove(S + (n), S, ((s) - (n)) * sizeof(sph_u32)); \ + memcpy(S, tmp, (n) * sizeof(sph_u32)); \ + } while (0) + +static void +fugue2_close(sph_fugue_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32) +{ + int i; + + CLOSE_ENTRY(30, 6, fugue2_core) + for (i = 0; i < 10; i ++) { + ROR(3, 30); + CMIX30(S[0], S[1], S[2], S[4], S[5], S[6], S[15], S[16], S[17]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[15] ^= S[0]; + ROR(15, 30); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[16] ^= S[0]; + ROR(14, 30); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[15] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + 
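/* added note: the digest words are taken from state cells S[1..4] and S[15..17] of the final state, plus S[18] when the full 256-bit output is requested */ + 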
sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out + 12, S[ 4]); + sph_enc32be(out + 16, S[15]); + sph_enc32be(out + 20, S[16]); + sph_enc32be(out + 24, S[17]); + if (out_size_w32 == 8) { + sph_enc32be(out + 28, S[18]); + sph_fugue256_init(sc); + } else { + sph_fugue224_init(sc); + } +} + +static void +fugue3_close(sph_fugue_context *sc, unsigned ub, unsigned n, void *dst) +{ + int i; + + CLOSE_ENTRY(36, 9, fugue3_core) + for (i = 0; i < 18; i ++) { + ROR(3, 36); + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[12] ^= S[0]; + S[24] ^= S[0]; + ROR(12, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[13] ^= S[0]; + S[24] ^= S[0]; + ROR(12, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[13] ^= S[0]; + S[25] ^= S[0]; + ROR(11, 36); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[12] ^= S[0]; + S[24] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out + 12, S[ 4]); + sph_enc32be(out + 16, S[12]); + sph_enc32be(out + 20, S[13]); + sph_enc32be(out + 24, S[14]); + sph_enc32be(out + 28, S[15]); + sph_enc32be(out + 32, S[24]); + sph_enc32be(out + 36, S[25]); + sph_enc32be(out + 40, S[26]); + sph_enc32be(out + 44, S[27]); + sph_fugue384_init(sc); +} + +static void +fugue4_close(sph_fugue_context *sc, unsigned ub, unsigned n, void *dst) +{ + int i; + + CLOSE_ENTRY(36, 12, fugue4_core) + for (i = 0; i < 32; i ++) { + ROR(3, 36); + CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]); + SMIX(S[0], S[1], S[2], S[3]); + } + for (i = 0; i < 13; i ++) { + S[4] ^= S[0]; + S[9] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[19] ^= S[0]; + S[27] ^= S[0]; + ROR(9, 36); + SMIX(S[0], S[1], S[2], S[3]); + S[4] ^= S[0]; + S[10] ^= S[0]; + S[19] ^= S[0]; + S[28] ^= S[0]; + ROR(8, 36); + SMIX(S[0], S[1], S[2], S[3]); + } + S[4] ^= S[0]; + S[9] ^= S[0]; + S[18] ^= S[0]; + S[27] ^= S[0]; + out = dst; + sph_enc32be(out + 0, S[ 1]); + sph_enc32be(out + 4, S[ 2]); + sph_enc32be(out + 8, S[ 3]); + sph_enc32be(out + 12, S[ 4]); + sph_enc32be(out + 16, S[ 9]); + sph_enc32be(out + 20, S[10]); + sph_enc32be(out + 24, S[11]); + sph_enc32be(out + 28, S[12]); + sph_enc32be(out + 32, S[18]); + sph_enc32be(out + 36, S[19]); + sph_enc32be(out + 40, S[20]); + sph_enc32be(out + 44, S[21]); + sph_enc32be(out + 48, S[27]); + sph_enc32be(out + 52, S[28]); + sph_enc32be(out + 56, S[29]); + sph_enc32be(out + 60, S[30]); + sph_fugue512_init(sc); +} + +void +sph_fugue224_init(void *cc) +{ + fugue_init(cc, 23, IV224, 7); +} + +void +sph_fugue224(void *cc, const void *data, size_t len) +{ + fugue2_core(cc, data, len); +} + +void +sph_fugue224_close(void *cc, void *dst) +{ + fugue2_close(cc, 0, 0, dst, 7); +} + +void +sph_fugue224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue2_close(cc, ub, n, dst, 7); +} + +void +sph_fugue256_init(void *cc) +{ + fugue_init(cc, 22, IV256, 8); +} + +void +sph_fugue256(void *cc, const void *data, size_t len) +{ + fugue2_core(cc, data, len); +} + +void +sph_fugue256_close(void *cc, void *dst) +{ + fugue2_close(cc, 0, 0, dst, 8); +} + +void +sph_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue2_close(cc, ub, n, dst, 8); 
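+ /* added note: the final argument selects the digest size in 32-bit words, 8 for Fugue-256 and 7 for Fugue-224; typical use is one init call, any number of update calls, then a single close */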
+} + +void +sph_fugue384_init(void *cc) +{ + fugue_init(cc, 24, IV384, 12); +} + +void +sph_fugue384(void *cc, const void *data, size_t len) +{ + fugue3_core(cc, data, len); +} + +void +sph_fugue384_close(void *cc, void *dst) +{ + fugue3_close(cc, 0, 0, dst); +} + +void +sph_fugue384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue3_close(cc, ub, n, dst); +} + +void +sph_fugue512_init(void *cc) +{ + fugue_init(cc, 20, IV512, 16); +} + +void +sph_fugue512(void *cc, const void *data, size_t len) +{ + fugue4_core(cc, data, len); +} + +void +sph_fugue512_close(void *cc, void *dst) +{ + fugue4_close(cc, 0, 0, dst); +} + +void +sph_fugue512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + fugue4_close(cc, ub, n, dst); +} +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/sph_fugue.h b/sha3/sph_fugue.h new file mode 100644 index 00000000..84e60aaf --- /dev/null +++ b/sha3/sph_fugue.h @@ -0,0 +1,81 @@ +#ifndef SPH_FUGUE_H__ +#define SPH_FUGUE_H__ + +#include <stddef.h> +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#define SPH_SIZE_fugue224 224 + +#define SPH_SIZE_fugue256 256 + +#define SPH_SIZE_fugue384 384 + +#define SPH_SIZE_fugue512 512 + +typedef struct { +#ifndef DOXYGEN_IGNORE + sph_u32 partial; + unsigned partial_len; + unsigned round_shift; + sph_u32 S[36]; +#if SPH_64 + sph_u64 bit_count; +#else + sph_u32 bit_count_high, bit_count_low; +#endif +#endif +} sph_fugue_context; + +typedef sph_fugue_context sph_fugue224_context; + +typedef sph_fugue_context sph_fugue256_context; + +typedef sph_fugue_context sph_fugue384_context; + +typedef sph_fugue_context sph_fugue512_context; + +void sph_fugue224_init(void *cc); + +void sph_fugue224(void *cc, const void *data, size_t len); + +void sph_fugue224_close(void *cc, void *dst); + +void sph_fugue224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue256_init(void *cc); + +void sph_fugue256(void *cc, const void *data, size_t len); + +void sph_fugue256_close(void *cc, void *dst); + +void sph_fugue256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue384_init(void *cc); + +void sph_fugue384(void *cc, const void *data, size_t len); + +void sph_fugue384_close(void *cc, void *dst); + +void sph_fugue384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +void sph_fugue512_init(void *cc); + +void sph_fugue512(void *cc, const void *data, size_t len); + +void sph_fugue512_close(void *cc, void *dst); + +void sph_fugue512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/sha3/sph_groestl.c b/sha3/sph_groestl.c new file mode 100644 index 00000000..91f75d32 --- /dev/null +++ b/sha3/sph_groestl.c @@ -0,0 +1,3119 @@ +/* $Id: groestl.c 260 2011-07-21 01:02:38Z tp $ */ +/* + * Groestl implementation.
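+ * (annotation: this is the table-driven variant; the round function is + * evaluated with 256-entry lookup tables, T0 onward below, each entry + * combining the S-box output with the MixBytes column multipliers for + * one byte position of a state column.)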
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin <thomas.pornin@cryptolog.com> + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_groestl.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_GROESTL +#define SPH_SMALL_FOOTPRINT_GROESTL 1 +#endif + +/* + * Apparently, the 32-bit-only version is not faster than the 64-bit + * version unless using the "small footprint" code on a 32-bit machine. + */ +#if !defined SPH_GROESTL_64 +#if SPH_SMALL_FOOTPRINT_GROESTL && !SPH_64_TRUE +#define SPH_GROESTL_64 0 +#else +#define SPH_GROESTL_64 1 +#endif +#endif + +#if !SPH_64 +#undef SPH_GROESTL_64 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +/* + * The internal representation may use either big-endian or + * little-endian. Using the platform default representation speeds up + * encoding and decoding between bytes and the matrix columns.
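+ * + * For instance, with the little-endian layout the C64e() macro below + * stores each 64-bit table constant byte-reversed (for example, + * C64e(0x0123456789ABCDEF) evaluates to 0xEFCDAB8967452301), so the + * state can be read and written with plain little-endian accesses.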
+ */ + +#undef USE_LE +#if SPH_GROESTL_LITTLE_ENDIAN +#define USE_LE 1 +#elif SPH_GROESTL_BIG_ENDIAN +#define USE_LE 0 +#elif SPH_LITTLE_ENDIAN +#define USE_LE 1 +#endif + +#if USE_LE + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le +#define B32_0(x) ((x) & 0xFF) +#define B32_1(x) (((x) >> 8) & 0xFF) +#define B32_2(x) (((x) >> 16) & 0xFF) +#define B32_3(x) ((x) >> 24) + +#define R32u(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) +#define R32d(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r))) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) (((sph_u32)(r) << 24) ^ SPH_T32(~((sph_u32)(j) << 24))) + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#define B64_0(x) ((x) & 0xFF) +#define B64_1(x) (((x) >> 8) & 0xFF) +#define B64_2(x) (((x) >> 16) & 0xFF) +#define B64_3(x) (((x) >> 24) & 0xFF) +#define B64_4(x) (((x) >> 32) & 0xFF) +#define B64_5(x) (((x) >> 40) & 0xFF) +#define B64_6(x) (((x) >> 48) & 0xFF) +#define B64_7(x) ((x) >> 56) +#define R64 SPH_ROTL64 +#define PC64(j, r) ((sph_u64)((j) + (r))) +#define QC64(j, r) (((sph_u64)(r) << 56) ^ SPH_T64(~((sph_u64)(j) << 56))) +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#define B32_0(x) ((x) >> 24) +#define B32_1(x) (((x) >> 16) & 0xFF) +#define B32_2(x) (((x) >> 8) & 0xFF) +#define B32_3(x) ((x) & 0xFF) + +#define R32u(u, d) SPH_T32(((u) >> 16) | ((d) << 16)) +#define R32d(u, d) SPH_T32(((u) << 16) | ((d) >> 16)) + +#define PC32up(j, r) ((sph_u32)((j) + (r)) << 24) +#define PC32dn(j, r) 0 +#define QC32up(j, r) SPH_C32(0xFFFFFFFF) +#define QC32dn(j, r) ((sph_u32)(r) ^ SPH_T32(~(sph_u32)(j))) + +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#define B64_0(x) ((x) >> 56) +#define B64_1(x) (((x) >> 48) & 0xFF) +#define B64_2(x) (((x) >> 40) & 0xFF) +#define B64_3(x) (((x) >> 32) & 0xFF) +#define B64_4(x) (((x) >> 24) & 0xFF) +#define B64_5(x) (((x) >> 16) & 0xFF) +#define B64_6(x) (((x) >> 8) & 0xFF) +#define B64_7(x) ((x) & 0xFF) +#define R64 SPH_ROTR64 +#define PC64(j, r) ((sph_u64)((j) + (r)) << 56) +#define QC64(j, r) ((sph_u64)(r) ^ SPH_T64(~(sph_u64)(j))) +#endif + +#endif + +#if SPH_GROESTL_64 + +static const sph_u64 T0[] = { + C64e(0xc632f4a5f497a5c6), C64e(0xf86f978497eb84f8), + C64e(0xee5eb099b0c799ee), C64e(0xf67a8c8d8cf78df6), + C64e(0xffe8170d17e50dff), C64e(0xd60adcbddcb7bdd6), + C64e(0xde16c8b1c8a7b1de), C64e(0x916dfc54fc395491), + C64e(0x6090f050f0c05060), C64e(0x0207050305040302), + C64e(0xce2ee0a9e087a9ce), C64e(0x56d1877d87ac7d56), + C64e(0xe7cc2b192bd519e7), C64e(0xb513a662a67162b5), + C64e(0x4d7c31e6319ae64d), C64e(0xec59b59ab5c39aec), + C64e(0x8f40cf45cf05458f), C64e(0x1fa3bc9dbc3e9d1f), + C64e(0x8949c040c0094089), C64e(0xfa68928792ef87fa), + 
C64e(0xefd03f153fc515ef), C64e(0xb29426eb267febb2), + C64e(0x8ece40c94007c98e), C64e(0xfbe61d0b1ded0bfb), + C64e(0x416e2fec2f82ec41), C64e(0xb31aa967a97d67b3), + C64e(0x5f431cfd1cbefd5f), C64e(0x456025ea258aea45), + C64e(0x23f9dabfda46bf23), C64e(0x535102f702a6f753), + C64e(0xe445a196a1d396e4), C64e(0x9b76ed5bed2d5b9b), + C64e(0x75285dc25deac275), C64e(0xe1c5241c24d91ce1), + C64e(0x3dd4e9aee97aae3d), C64e(0x4cf2be6abe986a4c), + C64e(0x6c82ee5aeed85a6c), C64e(0x7ebdc341c3fc417e), + C64e(0xf5f3060206f102f5), C64e(0x8352d14fd11d4f83), + C64e(0x688ce45ce4d05c68), C64e(0x515607f407a2f451), + C64e(0xd18d5c345cb934d1), C64e(0xf9e1180818e908f9), + C64e(0xe24cae93aedf93e2), C64e(0xab3e9573954d73ab), + C64e(0x6297f553f5c45362), C64e(0x2a6b413f41543f2a), + C64e(0x081c140c14100c08), C64e(0x9563f652f6315295), + C64e(0x46e9af65af8c6546), C64e(0x9d7fe25ee2215e9d), + C64e(0x3048782878602830), C64e(0x37cff8a1f86ea137), + C64e(0x0a1b110f11140f0a), C64e(0x2febc4b5c45eb52f), + C64e(0x0e151b091b1c090e), C64e(0x247e5a365a483624), + C64e(0x1badb69bb6369b1b), C64e(0xdf98473d47a53ddf), + C64e(0xcda76a266a8126cd), C64e(0x4ef5bb69bb9c694e), + C64e(0x7f334ccd4cfecd7f), C64e(0xea50ba9fbacf9fea), + C64e(0x123f2d1b2d241b12), C64e(0x1da4b99eb93a9e1d), + C64e(0x58c49c749cb07458), C64e(0x3446722e72682e34), + C64e(0x3641772d776c2d36), C64e(0xdc11cdb2cda3b2dc), + C64e(0xb49d29ee2973eeb4), C64e(0x5b4d16fb16b6fb5b), + C64e(0xa4a501f60153f6a4), C64e(0x76a1d74dd7ec4d76), + C64e(0xb714a361a37561b7), C64e(0x7d3449ce49face7d), + C64e(0x52df8d7b8da47b52), C64e(0xdd9f423e42a13edd), + C64e(0x5ecd937193bc715e), C64e(0x13b1a297a2269713), + C64e(0xa6a204f50457f5a6), C64e(0xb901b868b86968b9), + C64e(0x0000000000000000), C64e(0xc1b5742c74992cc1), + C64e(0x40e0a060a0806040), C64e(0xe3c2211f21dd1fe3), + C64e(0x793a43c843f2c879), C64e(0xb69a2ced2c77edb6), + C64e(0xd40dd9bed9b3bed4), C64e(0x8d47ca46ca01468d), + C64e(0x671770d970ced967), C64e(0x72afdd4bdde44b72), + C64e(0x94ed79de7933de94), C64e(0x98ff67d4672bd498), + C64e(0xb09323e8237be8b0), C64e(0x855bde4ade114a85), + C64e(0xbb06bd6bbd6d6bbb), C64e(0xc5bb7e2a7e912ac5), + C64e(0x4f7b34e5349ee54f), C64e(0xedd73a163ac116ed), + C64e(0x86d254c55417c586), C64e(0x9af862d7622fd79a), + C64e(0x6699ff55ffcc5566), C64e(0x11b6a794a7229411), + C64e(0x8ac04acf4a0fcf8a), C64e(0xe9d9301030c910e9), + C64e(0x040e0a060a080604), C64e(0xfe66988198e781fe), + C64e(0xa0ab0bf00b5bf0a0), C64e(0x78b4cc44ccf04478), + C64e(0x25f0d5bad54aba25), C64e(0x4b753ee33e96e34b), + C64e(0xa2ac0ef30e5ff3a2), C64e(0x5d4419fe19bafe5d), + C64e(0x80db5bc05b1bc080), C64e(0x0580858a850a8a05), + C64e(0x3fd3ecadec7ead3f), C64e(0x21fedfbcdf42bc21), + C64e(0x70a8d848d8e04870), C64e(0xf1fd0c040cf904f1), + C64e(0x63197adf7ac6df63), C64e(0x772f58c158eec177), + C64e(0xaf309f759f4575af), C64e(0x42e7a563a5846342), + C64e(0x2070503050403020), C64e(0xe5cb2e1a2ed11ae5), + C64e(0xfdef120e12e10efd), C64e(0xbf08b76db7656dbf), + C64e(0x8155d44cd4194c81), C64e(0x18243c143c301418), + C64e(0x26795f355f4c3526), C64e(0xc3b2712f719d2fc3), + C64e(0xbe8638e13867e1be), C64e(0x35c8fda2fd6aa235), + C64e(0x88c74fcc4f0bcc88), C64e(0x2e654b394b5c392e), + C64e(0x936af957f93d5793), C64e(0x55580df20daaf255), + C64e(0xfc619d829de382fc), C64e(0x7ab3c947c9f4477a), + C64e(0xc827efacef8bacc8), C64e(0xba8832e7326fe7ba), + C64e(0x324f7d2b7d642b32), C64e(0xe642a495a4d795e6), + C64e(0xc03bfba0fb9ba0c0), C64e(0x19aab398b3329819), + C64e(0x9ef668d16827d19e), C64e(0xa322817f815d7fa3), + C64e(0x44eeaa66aa886644), C64e(0x54d6827e82a87e54), + C64e(0x3bdde6abe676ab3b), 
C64e(0x0b959e839e16830b), + C64e(0x8cc945ca4503ca8c), C64e(0xc7bc7b297b9529c7), + C64e(0x6b056ed36ed6d36b), C64e(0x286c443c44503c28), + C64e(0xa72c8b798b5579a7), C64e(0xbc813de23d63e2bc), + C64e(0x1631271d272c1d16), C64e(0xad379a769a4176ad), + C64e(0xdb964d3b4dad3bdb), C64e(0x649efa56fac85664), + C64e(0x74a6d24ed2e84e74), C64e(0x1436221e22281e14), + C64e(0x92e476db763fdb92), C64e(0x0c121e0a1e180a0c), + C64e(0x48fcb46cb4906c48), C64e(0xb88f37e4376be4b8), + C64e(0x9f78e75de7255d9f), C64e(0xbd0fb26eb2616ebd), + C64e(0x43692aef2a86ef43), C64e(0xc435f1a6f193a6c4), + C64e(0x39dae3a8e372a839), C64e(0x31c6f7a4f762a431), + C64e(0xd38a593759bd37d3), C64e(0xf274868b86ff8bf2), + C64e(0xd583563256b132d5), C64e(0x8b4ec543c50d438b), + C64e(0x6e85eb59ebdc596e), C64e(0xda18c2b7c2afb7da), + C64e(0x018e8f8c8f028c01), C64e(0xb11dac64ac7964b1), + C64e(0x9cf16dd26d23d29c), C64e(0x49723be03b92e049), + C64e(0xd81fc7b4c7abb4d8), C64e(0xacb915fa1543faac), + C64e(0xf3fa090709fd07f3), C64e(0xcfa06f256f8525cf), + C64e(0xca20eaafea8fafca), C64e(0xf47d898e89f38ef4), + C64e(0x476720e9208ee947), C64e(0x1038281828201810), + C64e(0x6f0b64d564ded56f), C64e(0xf073838883fb88f0), + C64e(0x4afbb16fb1946f4a), C64e(0x5cca967296b8725c), + C64e(0x38546c246c702438), C64e(0x575f08f108aef157), + C64e(0x732152c752e6c773), C64e(0x9764f351f3355197), + C64e(0xcbae6523658d23cb), C64e(0xa125847c84597ca1), + C64e(0xe857bf9cbfcb9ce8), C64e(0x3e5d6321637c213e), + C64e(0x96ea7cdd7c37dd96), C64e(0x611e7fdc7fc2dc61), + C64e(0x0d9c9186911a860d), C64e(0x0f9b9485941e850f), + C64e(0xe04bab90abdb90e0), C64e(0x7cbac642c6f8427c), + C64e(0x712657c457e2c471), C64e(0xcc29e5aae583aacc), + C64e(0x90e373d8733bd890), C64e(0x06090f050f0c0506), + C64e(0xf7f4030103f501f7), C64e(0x1c2a36123638121c), + C64e(0xc23cfea3fe9fa3c2), C64e(0x6a8be15fe1d45f6a), + C64e(0xaebe10f91047f9ae), C64e(0x69026bd06bd2d069), + C64e(0x17bfa891a82e9117), C64e(0x9971e858e8295899), + C64e(0x3a5369276974273a), C64e(0x27f7d0b9d04eb927), + C64e(0xd991483848a938d9), C64e(0xebde351335cd13eb), + C64e(0x2be5ceb3ce56b32b), C64e(0x2277553355443322), + C64e(0xd204d6bbd6bfbbd2), C64e(0xa9399070904970a9), + C64e(0x07878089800e8907), C64e(0x33c1f2a7f266a733), + C64e(0x2decc1b6c15ab62d), C64e(0x3c5a66226678223c), + C64e(0x15b8ad92ad2a9215), C64e(0xc9a96020608920c9), + C64e(0x875cdb49db154987), C64e(0xaab01aff1a4fffaa), + C64e(0x50d8887888a07850), C64e(0xa52b8e7a8e517aa5), + C64e(0x03898a8f8a068f03), C64e(0x594a13f813b2f859), + C64e(0x09929b809b128009), C64e(0x1a2339173934171a), + C64e(0x651075da75cada65), C64e(0xd784533153b531d7), + C64e(0x84d551c65113c684), C64e(0xd003d3b8d3bbb8d0), + C64e(0x82dc5ec35e1fc382), C64e(0x29e2cbb0cb52b029), + C64e(0x5ac3997799b4775a), C64e(0x1e2d3311333c111e), + C64e(0x7b3d46cb46f6cb7b), C64e(0xa8b71ffc1f4bfca8), + C64e(0x6d0c61d661dad66d), C64e(0x2c624e3a4e583a2c) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T1[] = { + C64e(0xc6c632f4a5f497a5), C64e(0xf8f86f978497eb84), + C64e(0xeeee5eb099b0c799), C64e(0xf6f67a8c8d8cf78d), + C64e(0xffffe8170d17e50d), C64e(0xd6d60adcbddcb7bd), + C64e(0xdede16c8b1c8a7b1), C64e(0x91916dfc54fc3954), + C64e(0x606090f050f0c050), C64e(0x0202070503050403), + C64e(0xcece2ee0a9e087a9), C64e(0x5656d1877d87ac7d), + C64e(0xe7e7cc2b192bd519), C64e(0xb5b513a662a67162), + C64e(0x4d4d7c31e6319ae6), C64e(0xecec59b59ab5c39a), + C64e(0x8f8f40cf45cf0545), C64e(0x1f1fa3bc9dbc3e9d), + C64e(0x898949c040c00940), C64e(0xfafa68928792ef87), + C64e(0xefefd03f153fc515), C64e(0xb2b29426eb267feb), + C64e(0x8e8ece40c94007c9), C64e(0xfbfbe61d0b1ded0b), 
+ C64e(0x41416e2fec2f82ec), C64e(0xb3b31aa967a97d67), + C64e(0x5f5f431cfd1cbefd), C64e(0x45456025ea258aea), + C64e(0x2323f9dabfda46bf), C64e(0x53535102f702a6f7), + C64e(0xe4e445a196a1d396), C64e(0x9b9b76ed5bed2d5b), + C64e(0x7575285dc25deac2), C64e(0xe1e1c5241c24d91c), + C64e(0x3d3dd4e9aee97aae), C64e(0x4c4cf2be6abe986a), + C64e(0x6c6c82ee5aeed85a), C64e(0x7e7ebdc341c3fc41), + C64e(0xf5f5f3060206f102), C64e(0x838352d14fd11d4f), + C64e(0x68688ce45ce4d05c), C64e(0x51515607f407a2f4), + C64e(0xd1d18d5c345cb934), C64e(0xf9f9e1180818e908), + C64e(0xe2e24cae93aedf93), C64e(0xabab3e9573954d73), + C64e(0x626297f553f5c453), C64e(0x2a2a6b413f41543f), + C64e(0x08081c140c14100c), C64e(0x959563f652f63152), + C64e(0x4646e9af65af8c65), C64e(0x9d9d7fe25ee2215e), + C64e(0x3030487828786028), C64e(0x3737cff8a1f86ea1), + C64e(0x0a0a1b110f11140f), C64e(0x2f2febc4b5c45eb5), + C64e(0x0e0e151b091b1c09), C64e(0x24247e5a365a4836), + C64e(0x1b1badb69bb6369b), C64e(0xdfdf98473d47a53d), + C64e(0xcdcda76a266a8126), C64e(0x4e4ef5bb69bb9c69), + C64e(0x7f7f334ccd4cfecd), C64e(0xeaea50ba9fbacf9f), + C64e(0x12123f2d1b2d241b), C64e(0x1d1da4b99eb93a9e), + C64e(0x5858c49c749cb074), C64e(0x343446722e72682e), + C64e(0x363641772d776c2d), C64e(0xdcdc11cdb2cda3b2), + C64e(0xb4b49d29ee2973ee), C64e(0x5b5b4d16fb16b6fb), + C64e(0xa4a4a501f60153f6), C64e(0x7676a1d74dd7ec4d), + C64e(0xb7b714a361a37561), C64e(0x7d7d3449ce49face), + C64e(0x5252df8d7b8da47b), C64e(0xdddd9f423e42a13e), + C64e(0x5e5ecd937193bc71), C64e(0x1313b1a297a22697), + C64e(0xa6a6a204f50457f5), C64e(0xb9b901b868b86968), + C64e(0x0000000000000000), C64e(0xc1c1b5742c74992c), + C64e(0x4040e0a060a08060), C64e(0xe3e3c2211f21dd1f), + C64e(0x79793a43c843f2c8), C64e(0xb6b69a2ced2c77ed), + C64e(0xd4d40dd9bed9b3be), C64e(0x8d8d47ca46ca0146), + C64e(0x67671770d970ced9), C64e(0x7272afdd4bdde44b), + C64e(0x9494ed79de7933de), C64e(0x9898ff67d4672bd4), + C64e(0xb0b09323e8237be8), C64e(0x85855bde4ade114a), + C64e(0xbbbb06bd6bbd6d6b), C64e(0xc5c5bb7e2a7e912a), + C64e(0x4f4f7b34e5349ee5), C64e(0xededd73a163ac116), + C64e(0x8686d254c55417c5), C64e(0x9a9af862d7622fd7), + C64e(0x666699ff55ffcc55), C64e(0x1111b6a794a72294), + C64e(0x8a8ac04acf4a0fcf), C64e(0xe9e9d9301030c910), + C64e(0x04040e0a060a0806), C64e(0xfefe66988198e781), + C64e(0xa0a0ab0bf00b5bf0), C64e(0x7878b4cc44ccf044), + C64e(0x2525f0d5bad54aba), C64e(0x4b4b753ee33e96e3), + C64e(0xa2a2ac0ef30e5ff3), C64e(0x5d5d4419fe19bafe), + C64e(0x8080db5bc05b1bc0), C64e(0x050580858a850a8a), + C64e(0x3f3fd3ecadec7ead), C64e(0x2121fedfbcdf42bc), + C64e(0x7070a8d848d8e048), C64e(0xf1f1fd0c040cf904), + C64e(0x6363197adf7ac6df), C64e(0x77772f58c158eec1), + C64e(0xafaf309f759f4575), C64e(0x4242e7a563a58463), + C64e(0x2020705030504030), C64e(0xe5e5cb2e1a2ed11a), + C64e(0xfdfdef120e12e10e), C64e(0xbfbf08b76db7656d), + C64e(0x818155d44cd4194c), C64e(0x1818243c143c3014), + C64e(0x2626795f355f4c35), C64e(0xc3c3b2712f719d2f), + C64e(0xbebe8638e13867e1), C64e(0x3535c8fda2fd6aa2), + C64e(0x8888c74fcc4f0bcc), C64e(0x2e2e654b394b5c39), + C64e(0x93936af957f93d57), C64e(0x5555580df20daaf2), + C64e(0xfcfc619d829de382), C64e(0x7a7ab3c947c9f447), + C64e(0xc8c827efacef8bac), C64e(0xbaba8832e7326fe7), + C64e(0x32324f7d2b7d642b), C64e(0xe6e642a495a4d795), + C64e(0xc0c03bfba0fb9ba0), C64e(0x1919aab398b33298), + C64e(0x9e9ef668d16827d1), C64e(0xa3a322817f815d7f), + C64e(0x4444eeaa66aa8866), C64e(0x5454d6827e82a87e), + C64e(0x3b3bdde6abe676ab), C64e(0x0b0b959e839e1683), + C64e(0x8c8cc945ca4503ca), C64e(0xc7c7bc7b297b9529), + C64e(0x6b6b056ed36ed6d3), 
C64e(0x28286c443c44503c), + C64e(0xa7a72c8b798b5579), C64e(0xbcbc813de23d63e2), + C64e(0x161631271d272c1d), C64e(0xadad379a769a4176), + C64e(0xdbdb964d3b4dad3b), C64e(0x64649efa56fac856), + C64e(0x7474a6d24ed2e84e), C64e(0x141436221e22281e), + C64e(0x9292e476db763fdb), C64e(0x0c0c121e0a1e180a), + C64e(0x4848fcb46cb4906c), C64e(0xb8b88f37e4376be4), + C64e(0x9f9f78e75de7255d), C64e(0xbdbd0fb26eb2616e), + C64e(0x4343692aef2a86ef), C64e(0xc4c435f1a6f193a6), + C64e(0x3939dae3a8e372a8), C64e(0x3131c6f7a4f762a4), + C64e(0xd3d38a593759bd37), C64e(0xf2f274868b86ff8b), + C64e(0xd5d583563256b132), C64e(0x8b8b4ec543c50d43), + C64e(0x6e6e85eb59ebdc59), C64e(0xdada18c2b7c2afb7), + C64e(0x01018e8f8c8f028c), C64e(0xb1b11dac64ac7964), + C64e(0x9c9cf16dd26d23d2), C64e(0x4949723be03b92e0), + C64e(0xd8d81fc7b4c7abb4), C64e(0xacacb915fa1543fa), + C64e(0xf3f3fa090709fd07), C64e(0xcfcfa06f256f8525), + C64e(0xcaca20eaafea8faf), C64e(0xf4f47d898e89f38e), + C64e(0x47476720e9208ee9), C64e(0x1010382818282018), + C64e(0x6f6f0b64d564ded5), C64e(0xf0f073838883fb88), + C64e(0x4a4afbb16fb1946f), C64e(0x5c5cca967296b872), + C64e(0x3838546c246c7024), C64e(0x57575f08f108aef1), + C64e(0x73732152c752e6c7), C64e(0x979764f351f33551), + C64e(0xcbcbae6523658d23), C64e(0xa1a125847c84597c), + C64e(0xe8e857bf9cbfcb9c), C64e(0x3e3e5d6321637c21), + C64e(0x9696ea7cdd7c37dd), C64e(0x61611e7fdc7fc2dc), + C64e(0x0d0d9c9186911a86), C64e(0x0f0f9b9485941e85), + C64e(0xe0e04bab90abdb90), C64e(0x7c7cbac642c6f842), + C64e(0x71712657c457e2c4), C64e(0xcccc29e5aae583aa), + C64e(0x9090e373d8733bd8), C64e(0x0606090f050f0c05), + C64e(0xf7f7f4030103f501), C64e(0x1c1c2a3612363812), + C64e(0xc2c23cfea3fe9fa3), C64e(0x6a6a8be15fe1d45f), + C64e(0xaeaebe10f91047f9), C64e(0x6969026bd06bd2d0), + C64e(0x1717bfa891a82e91), C64e(0x999971e858e82958), + C64e(0x3a3a536927697427), C64e(0x2727f7d0b9d04eb9), + C64e(0xd9d991483848a938), C64e(0xebebde351335cd13), + C64e(0x2b2be5ceb3ce56b3), C64e(0x2222775533554433), + C64e(0xd2d204d6bbd6bfbb), C64e(0xa9a9399070904970), + C64e(0x0707878089800e89), C64e(0x3333c1f2a7f266a7), + C64e(0x2d2decc1b6c15ab6), C64e(0x3c3c5a6622667822), + C64e(0x1515b8ad92ad2a92), C64e(0xc9c9a96020608920), + C64e(0x87875cdb49db1549), C64e(0xaaaab01aff1a4fff), + C64e(0x5050d8887888a078), C64e(0xa5a52b8e7a8e517a), + C64e(0x0303898a8f8a068f), C64e(0x59594a13f813b2f8), + C64e(0x0909929b809b1280), C64e(0x1a1a233917393417), + C64e(0x65651075da75cada), C64e(0xd7d784533153b531), + C64e(0x8484d551c65113c6), C64e(0xd0d003d3b8d3bbb8), + C64e(0x8282dc5ec35e1fc3), C64e(0x2929e2cbb0cb52b0), + C64e(0x5a5ac3997799b477), C64e(0x1e1e2d3311333c11), + C64e(0x7b7b3d46cb46f6cb), C64e(0xa8a8b71ffc1f4bfc), + C64e(0x6d6d0c61d661dad6), C64e(0x2c2c624e3a4e583a) +}; + +static const sph_u64 T2[] = { + C64e(0xa5c6c632f4a5f497), C64e(0x84f8f86f978497eb), + C64e(0x99eeee5eb099b0c7), C64e(0x8df6f67a8c8d8cf7), + C64e(0x0dffffe8170d17e5), C64e(0xbdd6d60adcbddcb7), + C64e(0xb1dede16c8b1c8a7), C64e(0x5491916dfc54fc39), + C64e(0x50606090f050f0c0), C64e(0x0302020705030504), + C64e(0xa9cece2ee0a9e087), C64e(0x7d5656d1877d87ac), + C64e(0x19e7e7cc2b192bd5), C64e(0x62b5b513a662a671), + C64e(0xe64d4d7c31e6319a), C64e(0x9aecec59b59ab5c3), + C64e(0x458f8f40cf45cf05), C64e(0x9d1f1fa3bc9dbc3e), + C64e(0x40898949c040c009), C64e(0x87fafa68928792ef), + C64e(0x15efefd03f153fc5), C64e(0xebb2b29426eb267f), + C64e(0xc98e8ece40c94007), C64e(0x0bfbfbe61d0b1ded), + C64e(0xec41416e2fec2f82), C64e(0x67b3b31aa967a97d), + C64e(0xfd5f5f431cfd1cbe), C64e(0xea45456025ea258a), + C64e(0xbf2323f9dabfda46), 
C64e(0xf753535102f702a6), + C64e(0x96e4e445a196a1d3), C64e(0x5b9b9b76ed5bed2d), + C64e(0xc27575285dc25dea), C64e(0x1ce1e1c5241c24d9), + C64e(0xae3d3dd4e9aee97a), C64e(0x6a4c4cf2be6abe98), + C64e(0x5a6c6c82ee5aeed8), C64e(0x417e7ebdc341c3fc), + C64e(0x02f5f5f3060206f1), C64e(0x4f838352d14fd11d), + C64e(0x5c68688ce45ce4d0), C64e(0xf451515607f407a2), + C64e(0x34d1d18d5c345cb9), C64e(0x08f9f9e1180818e9), + C64e(0x93e2e24cae93aedf), C64e(0x73abab3e9573954d), + C64e(0x53626297f553f5c4), C64e(0x3f2a2a6b413f4154), + C64e(0x0c08081c140c1410), C64e(0x52959563f652f631), + C64e(0x654646e9af65af8c), C64e(0x5e9d9d7fe25ee221), + C64e(0x2830304878287860), C64e(0xa13737cff8a1f86e), + C64e(0x0f0a0a1b110f1114), C64e(0xb52f2febc4b5c45e), + C64e(0x090e0e151b091b1c), C64e(0x3624247e5a365a48), + C64e(0x9b1b1badb69bb636), C64e(0x3ddfdf98473d47a5), + C64e(0x26cdcda76a266a81), C64e(0x694e4ef5bb69bb9c), + C64e(0xcd7f7f334ccd4cfe), C64e(0x9feaea50ba9fbacf), + C64e(0x1b12123f2d1b2d24), C64e(0x9e1d1da4b99eb93a), + C64e(0x745858c49c749cb0), C64e(0x2e343446722e7268), + C64e(0x2d363641772d776c), C64e(0xb2dcdc11cdb2cda3), + C64e(0xeeb4b49d29ee2973), C64e(0xfb5b5b4d16fb16b6), + C64e(0xf6a4a4a501f60153), C64e(0x4d7676a1d74dd7ec), + C64e(0x61b7b714a361a375), C64e(0xce7d7d3449ce49fa), + C64e(0x7b5252df8d7b8da4), C64e(0x3edddd9f423e42a1), + C64e(0x715e5ecd937193bc), C64e(0x971313b1a297a226), + C64e(0xf5a6a6a204f50457), C64e(0x68b9b901b868b869), + C64e(0x0000000000000000), C64e(0x2cc1c1b5742c7499), + C64e(0x604040e0a060a080), C64e(0x1fe3e3c2211f21dd), + C64e(0xc879793a43c843f2), C64e(0xedb6b69a2ced2c77), + C64e(0xbed4d40dd9bed9b3), C64e(0x468d8d47ca46ca01), + C64e(0xd967671770d970ce), C64e(0x4b7272afdd4bdde4), + C64e(0xde9494ed79de7933), C64e(0xd49898ff67d4672b), + C64e(0xe8b0b09323e8237b), C64e(0x4a85855bde4ade11), + C64e(0x6bbbbb06bd6bbd6d), C64e(0x2ac5c5bb7e2a7e91), + C64e(0xe54f4f7b34e5349e), C64e(0x16ededd73a163ac1), + C64e(0xc58686d254c55417), C64e(0xd79a9af862d7622f), + C64e(0x55666699ff55ffcc), C64e(0x941111b6a794a722), + C64e(0xcf8a8ac04acf4a0f), C64e(0x10e9e9d9301030c9), + C64e(0x0604040e0a060a08), C64e(0x81fefe66988198e7), + C64e(0xf0a0a0ab0bf00b5b), C64e(0x447878b4cc44ccf0), + C64e(0xba2525f0d5bad54a), C64e(0xe34b4b753ee33e96), + C64e(0xf3a2a2ac0ef30e5f), C64e(0xfe5d5d4419fe19ba), + C64e(0xc08080db5bc05b1b), C64e(0x8a050580858a850a), + C64e(0xad3f3fd3ecadec7e), C64e(0xbc2121fedfbcdf42), + C64e(0x487070a8d848d8e0), C64e(0x04f1f1fd0c040cf9), + C64e(0xdf6363197adf7ac6), C64e(0xc177772f58c158ee), + C64e(0x75afaf309f759f45), C64e(0x634242e7a563a584), + C64e(0x3020207050305040), C64e(0x1ae5e5cb2e1a2ed1), + C64e(0x0efdfdef120e12e1), C64e(0x6dbfbf08b76db765), + C64e(0x4c818155d44cd419), C64e(0x141818243c143c30), + C64e(0x352626795f355f4c), C64e(0x2fc3c3b2712f719d), + C64e(0xe1bebe8638e13867), C64e(0xa23535c8fda2fd6a), + C64e(0xcc8888c74fcc4f0b), C64e(0x392e2e654b394b5c), + C64e(0x5793936af957f93d), C64e(0xf25555580df20daa), + C64e(0x82fcfc619d829de3), C64e(0x477a7ab3c947c9f4), + C64e(0xacc8c827efacef8b), C64e(0xe7baba8832e7326f), + C64e(0x2b32324f7d2b7d64), C64e(0x95e6e642a495a4d7), + C64e(0xa0c0c03bfba0fb9b), C64e(0x981919aab398b332), + C64e(0xd19e9ef668d16827), C64e(0x7fa3a322817f815d), + C64e(0x664444eeaa66aa88), C64e(0x7e5454d6827e82a8), + C64e(0xab3b3bdde6abe676), C64e(0x830b0b959e839e16), + C64e(0xca8c8cc945ca4503), C64e(0x29c7c7bc7b297b95), + C64e(0xd36b6b056ed36ed6), C64e(0x3c28286c443c4450), + C64e(0x79a7a72c8b798b55), C64e(0xe2bcbc813de23d63), + C64e(0x1d161631271d272c), C64e(0x76adad379a769a41), + 
C64e(0x3bdbdb964d3b4dad), C64e(0x5664649efa56fac8), + C64e(0x4e7474a6d24ed2e8), C64e(0x1e141436221e2228), + C64e(0xdb9292e476db763f), C64e(0x0a0c0c121e0a1e18), + C64e(0x6c4848fcb46cb490), C64e(0xe4b8b88f37e4376b), + C64e(0x5d9f9f78e75de725), C64e(0x6ebdbd0fb26eb261), + C64e(0xef4343692aef2a86), C64e(0xa6c4c435f1a6f193), + C64e(0xa83939dae3a8e372), C64e(0xa43131c6f7a4f762), + C64e(0x37d3d38a593759bd), C64e(0x8bf2f274868b86ff), + C64e(0x32d5d583563256b1), C64e(0x438b8b4ec543c50d), + C64e(0x596e6e85eb59ebdc), C64e(0xb7dada18c2b7c2af), + C64e(0x8c01018e8f8c8f02), C64e(0x64b1b11dac64ac79), + C64e(0xd29c9cf16dd26d23), C64e(0xe04949723be03b92), + C64e(0xb4d8d81fc7b4c7ab), C64e(0xfaacacb915fa1543), + C64e(0x07f3f3fa090709fd), C64e(0x25cfcfa06f256f85), + C64e(0xafcaca20eaafea8f), C64e(0x8ef4f47d898e89f3), + C64e(0xe947476720e9208e), C64e(0x1810103828182820), + C64e(0xd56f6f0b64d564de), C64e(0x88f0f073838883fb), + C64e(0x6f4a4afbb16fb194), C64e(0x725c5cca967296b8), + C64e(0x243838546c246c70), C64e(0xf157575f08f108ae), + C64e(0xc773732152c752e6), C64e(0x51979764f351f335), + C64e(0x23cbcbae6523658d), C64e(0x7ca1a125847c8459), + C64e(0x9ce8e857bf9cbfcb), C64e(0x213e3e5d6321637c), + C64e(0xdd9696ea7cdd7c37), C64e(0xdc61611e7fdc7fc2), + C64e(0x860d0d9c9186911a), C64e(0x850f0f9b9485941e), + C64e(0x90e0e04bab90abdb), C64e(0x427c7cbac642c6f8), + C64e(0xc471712657c457e2), C64e(0xaacccc29e5aae583), + C64e(0xd89090e373d8733b), C64e(0x050606090f050f0c), + C64e(0x01f7f7f4030103f5), C64e(0x121c1c2a36123638), + C64e(0xa3c2c23cfea3fe9f), C64e(0x5f6a6a8be15fe1d4), + C64e(0xf9aeaebe10f91047), C64e(0xd06969026bd06bd2), + C64e(0x911717bfa891a82e), C64e(0x58999971e858e829), + C64e(0x273a3a5369276974), C64e(0xb92727f7d0b9d04e), + C64e(0x38d9d991483848a9), C64e(0x13ebebde351335cd), + C64e(0xb32b2be5ceb3ce56), C64e(0x3322227755335544), + C64e(0xbbd2d204d6bbd6bf), C64e(0x70a9a93990709049), + C64e(0x890707878089800e), C64e(0xa73333c1f2a7f266), + C64e(0xb62d2decc1b6c15a), C64e(0x223c3c5a66226678), + C64e(0x921515b8ad92ad2a), C64e(0x20c9c9a960206089), + C64e(0x4987875cdb49db15), C64e(0xffaaaab01aff1a4f), + C64e(0x785050d8887888a0), C64e(0x7aa5a52b8e7a8e51), + C64e(0x8f0303898a8f8a06), C64e(0xf859594a13f813b2), + C64e(0x800909929b809b12), C64e(0x171a1a2339173934), + C64e(0xda65651075da75ca), C64e(0x31d7d784533153b5), + C64e(0xc68484d551c65113), C64e(0xb8d0d003d3b8d3bb), + C64e(0xc38282dc5ec35e1f), C64e(0xb02929e2cbb0cb52), + C64e(0x775a5ac3997799b4), C64e(0x111e1e2d3311333c), + C64e(0xcb7b7b3d46cb46f6), C64e(0xfca8a8b71ffc1f4b), + C64e(0xd66d6d0c61d661da), C64e(0x3a2c2c624e3a4e58) +}; + +static const sph_u64 T3[] = { + C64e(0x97a5c6c632f4a5f4), C64e(0xeb84f8f86f978497), + C64e(0xc799eeee5eb099b0), C64e(0xf78df6f67a8c8d8c), + C64e(0xe50dffffe8170d17), C64e(0xb7bdd6d60adcbddc), + C64e(0xa7b1dede16c8b1c8), C64e(0x395491916dfc54fc), + C64e(0xc050606090f050f0), C64e(0x0403020207050305), + C64e(0x87a9cece2ee0a9e0), C64e(0xac7d5656d1877d87), + C64e(0xd519e7e7cc2b192b), C64e(0x7162b5b513a662a6), + C64e(0x9ae64d4d7c31e631), C64e(0xc39aecec59b59ab5), + C64e(0x05458f8f40cf45cf), C64e(0x3e9d1f1fa3bc9dbc), + C64e(0x0940898949c040c0), C64e(0xef87fafa68928792), + C64e(0xc515efefd03f153f), C64e(0x7febb2b29426eb26), + C64e(0x07c98e8ece40c940), C64e(0xed0bfbfbe61d0b1d), + C64e(0x82ec41416e2fec2f), C64e(0x7d67b3b31aa967a9), + C64e(0xbefd5f5f431cfd1c), C64e(0x8aea45456025ea25), + C64e(0x46bf2323f9dabfda), C64e(0xa6f753535102f702), + C64e(0xd396e4e445a196a1), C64e(0x2d5b9b9b76ed5bed), + C64e(0xeac27575285dc25d), C64e(0xd91ce1e1c5241c24), + 
C64e(0x7aae3d3dd4e9aee9), C64e(0x986a4c4cf2be6abe), + C64e(0xd85a6c6c82ee5aee), C64e(0xfc417e7ebdc341c3), + C64e(0xf102f5f5f3060206), C64e(0x1d4f838352d14fd1), + C64e(0xd05c68688ce45ce4), C64e(0xa2f451515607f407), + C64e(0xb934d1d18d5c345c), C64e(0xe908f9f9e1180818), + C64e(0xdf93e2e24cae93ae), C64e(0x4d73abab3e957395), + C64e(0xc453626297f553f5), C64e(0x543f2a2a6b413f41), + C64e(0x100c08081c140c14), C64e(0x3152959563f652f6), + C64e(0x8c654646e9af65af), C64e(0x215e9d9d7fe25ee2), + C64e(0x6028303048782878), C64e(0x6ea13737cff8a1f8), + C64e(0x140f0a0a1b110f11), C64e(0x5eb52f2febc4b5c4), + C64e(0x1c090e0e151b091b), C64e(0x483624247e5a365a), + C64e(0x369b1b1badb69bb6), C64e(0xa53ddfdf98473d47), + C64e(0x8126cdcda76a266a), C64e(0x9c694e4ef5bb69bb), + C64e(0xfecd7f7f334ccd4c), C64e(0xcf9feaea50ba9fba), + C64e(0x241b12123f2d1b2d), C64e(0x3a9e1d1da4b99eb9), + C64e(0xb0745858c49c749c), C64e(0x682e343446722e72), + C64e(0x6c2d363641772d77), C64e(0xa3b2dcdc11cdb2cd), + C64e(0x73eeb4b49d29ee29), C64e(0xb6fb5b5b4d16fb16), + C64e(0x53f6a4a4a501f601), C64e(0xec4d7676a1d74dd7), + C64e(0x7561b7b714a361a3), C64e(0xface7d7d3449ce49), + C64e(0xa47b5252df8d7b8d), C64e(0xa13edddd9f423e42), + C64e(0xbc715e5ecd937193), C64e(0x26971313b1a297a2), + C64e(0x57f5a6a6a204f504), C64e(0x6968b9b901b868b8), + C64e(0x0000000000000000), C64e(0x992cc1c1b5742c74), + C64e(0x80604040e0a060a0), C64e(0xdd1fe3e3c2211f21), + C64e(0xf2c879793a43c843), C64e(0x77edb6b69a2ced2c), + C64e(0xb3bed4d40dd9bed9), C64e(0x01468d8d47ca46ca), + C64e(0xced967671770d970), C64e(0xe44b7272afdd4bdd), + C64e(0x33de9494ed79de79), C64e(0x2bd49898ff67d467), + C64e(0x7be8b0b09323e823), C64e(0x114a85855bde4ade), + C64e(0x6d6bbbbb06bd6bbd), C64e(0x912ac5c5bb7e2a7e), + C64e(0x9ee54f4f7b34e534), C64e(0xc116ededd73a163a), + C64e(0x17c58686d254c554), C64e(0x2fd79a9af862d762), + C64e(0xcc55666699ff55ff), C64e(0x22941111b6a794a7), + C64e(0x0fcf8a8ac04acf4a), C64e(0xc910e9e9d9301030), + C64e(0x080604040e0a060a), C64e(0xe781fefe66988198), + C64e(0x5bf0a0a0ab0bf00b), C64e(0xf0447878b4cc44cc), + C64e(0x4aba2525f0d5bad5), C64e(0x96e34b4b753ee33e), + C64e(0x5ff3a2a2ac0ef30e), C64e(0xbafe5d5d4419fe19), + C64e(0x1bc08080db5bc05b), C64e(0x0a8a050580858a85), + C64e(0x7ead3f3fd3ecadec), C64e(0x42bc2121fedfbcdf), + C64e(0xe0487070a8d848d8), C64e(0xf904f1f1fd0c040c), + C64e(0xc6df6363197adf7a), C64e(0xeec177772f58c158), + C64e(0x4575afaf309f759f), C64e(0x84634242e7a563a5), + C64e(0x4030202070503050), C64e(0xd11ae5e5cb2e1a2e), + C64e(0xe10efdfdef120e12), C64e(0x656dbfbf08b76db7), + C64e(0x194c818155d44cd4), C64e(0x30141818243c143c), + C64e(0x4c352626795f355f), C64e(0x9d2fc3c3b2712f71), + C64e(0x67e1bebe8638e138), C64e(0x6aa23535c8fda2fd), + C64e(0x0bcc8888c74fcc4f), C64e(0x5c392e2e654b394b), + C64e(0x3d5793936af957f9), C64e(0xaaf25555580df20d), + C64e(0xe382fcfc619d829d), C64e(0xf4477a7ab3c947c9), + C64e(0x8bacc8c827efacef), C64e(0x6fe7baba8832e732), + C64e(0x642b32324f7d2b7d), C64e(0xd795e6e642a495a4), + C64e(0x9ba0c0c03bfba0fb), C64e(0x32981919aab398b3), + C64e(0x27d19e9ef668d168), C64e(0x5d7fa3a322817f81), + C64e(0x88664444eeaa66aa), C64e(0xa87e5454d6827e82), + C64e(0x76ab3b3bdde6abe6), C64e(0x16830b0b959e839e), + C64e(0x03ca8c8cc945ca45), C64e(0x9529c7c7bc7b297b), + C64e(0xd6d36b6b056ed36e), C64e(0x503c28286c443c44), + C64e(0x5579a7a72c8b798b), C64e(0x63e2bcbc813de23d), + C64e(0x2c1d161631271d27), C64e(0x4176adad379a769a), + C64e(0xad3bdbdb964d3b4d), C64e(0xc85664649efa56fa), + C64e(0xe84e7474a6d24ed2), C64e(0x281e141436221e22), + C64e(0x3fdb9292e476db76), 
C64e(0x180a0c0c121e0a1e), + C64e(0x906c4848fcb46cb4), C64e(0x6be4b8b88f37e437), + C64e(0x255d9f9f78e75de7), C64e(0x616ebdbd0fb26eb2), + C64e(0x86ef4343692aef2a), C64e(0x93a6c4c435f1a6f1), + C64e(0x72a83939dae3a8e3), C64e(0x62a43131c6f7a4f7), + C64e(0xbd37d3d38a593759), C64e(0xff8bf2f274868b86), + C64e(0xb132d5d583563256), C64e(0x0d438b8b4ec543c5), + C64e(0xdc596e6e85eb59eb), C64e(0xafb7dada18c2b7c2), + C64e(0x028c01018e8f8c8f), C64e(0x7964b1b11dac64ac), + C64e(0x23d29c9cf16dd26d), C64e(0x92e04949723be03b), + C64e(0xabb4d8d81fc7b4c7), C64e(0x43faacacb915fa15), + C64e(0xfd07f3f3fa090709), C64e(0x8525cfcfa06f256f), + C64e(0x8fafcaca20eaafea), C64e(0xf38ef4f47d898e89), + C64e(0x8ee947476720e920), C64e(0x2018101038281828), + C64e(0xded56f6f0b64d564), C64e(0xfb88f0f073838883), + C64e(0x946f4a4afbb16fb1), C64e(0xb8725c5cca967296), + C64e(0x70243838546c246c), C64e(0xaef157575f08f108), + C64e(0xe6c773732152c752), C64e(0x3551979764f351f3), + C64e(0x8d23cbcbae652365), C64e(0x597ca1a125847c84), + C64e(0xcb9ce8e857bf9cbf), C64e(0x7c213e3e5d632163), + C64e(0x37dd9696ea7cdd7c), C64e(0xc2dc61611e7fdc7f), + C64e(0x1a860d0d9c918691), C64e(0x1e850f0f9b948594), + C64e(0xdb90e0e04bab90ab), C64e(0xf8427c7cbac642c6), + C64e(0xe2c471712657c457), C64e(0x83aacccc29e5aae5), + C64e(0x3bd89090e373d873), C64e(0x0c050606090f050f), + C64e(0xf501f7f7f4030103), C64e(0x38121c1c2a361236), + C64e(0x9fa3c2c23cfea3fe), C64e(0xd45f6a6a8be15fe1), + C64e(0x47f9aeaebe10f910), C64e(0xd2d06969026bd06b), + C64e(0x2e911717bfa891a8), C64e(0x2958999971e858e8), + C64e(0x74273a3a53692769), C64e(0x4eb92727f7d0b9d0), + C64e(0xa938d9d991483848), C64e(0xcd13ebebde351335), + C64e(0x56b32b2be5ceb3ce), C64e(0x4433222277553355), + C64e(0xbfbbd2d204d6bbd6), C64e(0x4970a9a939907090), + C64e(0x0e89070787808980), C64e(0x66a73333c1f2a7f2), + C64e(0x5ab62d2decc1b6c1), C64e(0x78223c3c5a662266), + C64e(0x2a921515b8ad92ad), C64e(0x8920c9c9a9602060), + C64e(0x154987875cdb49db), C64e(0x4fffaaaab01aff1a), + C64e(0xa0785050d8887888), C64e(0x517aa5a52b8e7a8e), + C64e(0x068f0303898a8f8a), C64e(0xb2f859594a13f813), + C64e(0x12800909929b809b), C64e(0x34171a1a23391739), + C64e(0xcada65651075da75), C64e(0xb531d7d784533153), + C64e(0x13c68484d551c651), C64e(0xbbb8d0d003d3b8d3), + C64e(0x1fc38282dc5ec35e), C64e(0x52b02929e2cbb0cb), + C64e(0xb4775a5ac3997799), C64e(0x3c111e1e2d331133), + C64e(0xf6cb7b7b3d46cb46), C64e(0x4bfca8a8b71ffc1f), + C64e(0xdad66d6d0c61d661), C64e(0x583a2c2c624e3a4e) +}; + +#endif + +static const sph_u64 T4[] = { + C64e(0xf497a5c6c632f4a5), C64e(0x97eb84f8f86f9784), + C64e(0xb0c799eeee5eb099), C64e(0x8cf78df6f67a8c8d), + C64e(0x17e50dffffe8170d), C64e(0xdcb7bdd6d60adcbd), + C64e(0xc8a7b1dede16c8b1), C64e(0xfc395491916dfc54), + C64e(0xf0c050606090f050), C64e(0x0504030202070503), + C64e(0xe087a9cece2ee0a9), C64e(0x87ac7d5656d1877d), + C64e(0x2bd519e7e7cc2b19), C64e(0xa67162b5b513a662), + C64e(0x319ae64d4d7c31e6), C64e(0xb5c39aecec59b59a), + C64e(0xcf05458f8f40cf45), C64e(0xbc3e9d1f1fa3bc9d), + C64e(0xc00940898949c040), C64e(0x92ef87fafa689287), + C64e(0x3fc515efefd03f15), C64e(0x267febb2b29426eb), + C64e(0x4007c98e8ece40c9), C64e(0x1ded0bfbfbe61d0b), + C64e(0x2f82ec41416e2fec), C64e(0xa97d67b3b31aa967), + C64e(0x1cbefd5f5f431cfd), C64e(0x258aea45456025ea), + C64e(0xda46bf2323f9dabf), C64e(0x02a6f753535102f7), + C64e(0xa1d396e4e445a196), C64e(0xed2d5b9b9b76ed5b), + C64e(0x5deac27575285dc2), C64e(0x24d91ce1e1c5241c), + C64e(0xe97aae3d3dd4e9ae), C64e(0xbe986a4c4cf2be6a), + C64e(0xeed85a6c6c82ee5a), C64e(0xc3fc417e7ebdc341), + 
C64e(0x06f102f5f5f30602), C64e(0xd11d4f838352d14f), + C64e(0xe4d05c68688ce45c), C64e(0x07a2f451515607f4), + C64e(0x5cb934d1d18d5c34), C64e(0x18e908f9f9e11808), + C64e(0xaedf93e2e24cae93), C64e(0x954d73abab3e9573), + C64e(0xf5c453626297f553), C64e(0x41543f2a2a6b413f), + C64e(0x14100c08081c140c), C64e(0xf63152959563f652), + C64e(0xaf8c654646e9af65), C64e(0xe2215e9d9d7fe25e), + C64e(0x7860283030487828), C64e(0xf86ea13737cff8a1), + C64e(0x11140f0a0a1b110f), C64e(0xc45eb52f2febc4b5), + C64e(0x1b1c090e0e151b09), C64e(0x5a483624247e5a36), + C64e(0xb6369b1b1badb69b), C64e(0x47a53ddfdf98473d), + C64e(0x6a8126cdcda76a26), C64e(0xbb9c694e4ef5bb69), + C64e(0x4cfecd7f7f334ccd), C64e(0xbacf9feaea50ba9f), + C64e(0x2d241b12123f2d1b), C64e(0xb93a9e1d1da4b99e), + C64e(0x9cb0745858c49c74), C64e(0x72682e343446722e), + C64e(0x776c2d363641772d), C64e(0xcda3b2dcdc11cdb2), + C64e(0x2973eeb4b49d29ee), C64e(0x16b6fb5b5b4d16fb), + C64e(0x0153f6a4a4a501f6), C64e(0xd7ec4d7676a1d74d), + C64e(0xa37561b7b714a361), C64e(0x49face7d7d3449ce), + C64e(0x8da47b5252df8d7b), C64e(0x42a13edddd9f423e), + C64e(0x93bc715e5ecd9371), C64e(0xa226971313b1a297), + C64e(0x0457f5a6a6a204f5), C64e(0xb86968b9b901b868), + C64e(0x0000000000000000), C64e(0x74992cc1c1b5742c), + C64e(0xa080604040e0a060), C64e(0x21dd1fe3e3c2211f), + C64e(0x43f2c879793a43c8), C64e(0x2c77edb6b69a2ced), + C64e(0xd9b3bed4d40dd9be), C64e(0xca01468d8d47ca46), + C64e(0x70ced967671770d9), C64e(0xdde44b7272afdd4b), + C64e(0x7933de9494ed79de), C64e(0x672bd49898ff67d4), + C64e(0x237be8b0b09323e8), C64e(0xde114a85855bde4a), + C64e(0xbd6d6bbbbb06bd6b), C64e(0x7e912ac5c5bb7e2a), + C64e(0x349ee54f4f7b34e5), C64e(0x3ac116ededd73a16), + C64e(0x5417c58686d254c5), C64e(0x622fd79a9af862d7), + C64e(0xffcc55666699ff55), C64e(0xa722941111b6a794), + C64e(0x4a0fcf8a8ac04acf), C64e(0x30c910e9e9d93010), + C64e(0x0a080604040e0a06), C64e(0x98e781fefe669881), + C64e(0x0b5bf0a0a0ab0bf0), C64e(0xccf0447878b4cc44), + C64e(0xd54aba2525f0d5ba), C64e(0x3e96e34b4b753ee3), + C64e(0x0e5ff3a2a2ac0ef3), C64e(0x19bafe5d5d4419fe), + C64e(0x5b1bc08080db5bc0), C64e(0x850a8a050580858a), + C64e(0xec7ead3f3fd3ecad), C64e(0xdf42bc2121fedfbc), + C64e(0xd8e0487070a8d848), C64e(0x0cf904f1f1fd0c04), + C64e(0x7ac6df6363197adf), C64e(0x58eec177772f58c1), + C64e(0x9f4575afaf309f75), C64e(0xa584634242e7a563), + C64e(0x5040302020705030), C64e(0x2ed11ae5e5cb2e1a), + C64e(0x12e10efdfdef120e), C64e(0xb7656dbfbf08b76d), + C64e(0xd4194c818155d44c), C64e(0x3c30141818243c14), + C64e(0x5f4c352626795f35), C64e(0x719d2fc3c3b2712f), + C64e(0x3867e1bebe8638e1), C64e(0xfd6aa23535c8fda2), + C64e(0x4f0bcc8888c74fcc), C64e(0x4b5c392e2e654b39), + C64e(0xf93d5793936af957), C64e(0x0daaf25555580df2), + C64e(0x9de382fcfc619d82), C64e(0xc9f4477a7ab3c947), + C64e(0xef8bacc8c827efac), C64e(0x326fe7baba8832e7), + C64e(0x7d642b32324f7d2b), C64e(0xa4d795e6e642a495), + C64e(0xfb9ba0c0c03bfba0), C64e(0xb332981919aab398), + C64e(0x6827d19e9ef668d1), C64e(0x815d7fa3a322817f), + C64e(0xaa88664444eeaa66), C64e(0x82a87e5454d6827e), + C64e(0xe676ab3b3bdde6ab), C64e(0x9e16830b0b959e83), + C64e(0x4503ca8c8cc945ca), C64e(0x7b9529c7c7bc7b29), + C64e(0x6ed6d36b6b056ed3), C64e(0x44503c28286c443c), + C64e(0x8b5579a7a72c8b79), C64e(0x3d63e2bcbc813de2), + C64e(0x272c1d161631271d), C64e(0x9a4176adad379a76), + C64e(0x4dad3bdbdb964d3b), C64e(0xfac85664649efa56), + C64e(0xd2e84e7474a6d24e), C64e(0x22281e141436221e), + C64e(0x763fdb9292e476db), C64e(0x1e180a0c0c121e0a), + C64e(0xb4906c4848fcb46c), C64e(0x376be4b8b88f37e4), + C64e(0xe7255d9f9f78e75d), 
C64e(0xb2616ebdbd0fb26e), + C64e(0x2a86ef4343692aef), C64e(0xf193a6c4c435f1a6), + C64e(0xe372a83939dae3a8), C64e(0xf762a43131c6f7a4), + C64e(0x59bd37d3d38a5937), C64e(0x86ff8bf2f274868b), + C64e(0x56b132d5d5835632), C64e(0xc50d438b8b4ec543), + C64e(0xebdc596e6e85eb59), C64e(0xc2afb7dada18c2b7), + C64e(0x8f028c01018e8f8c), C64e(0xac7964b1b11dac64), + C64e(0x6d23d29c9cf16dd2), C64e(0x3b92e04949723be0), + C64e(0xc7abb4d8d81fc7b4), C64e(0x1543faacacb915fa), + C64e(0x09fd07f3f3fa0907), C64e(0x6f8525cfcfa06f25), + C64e(0xea8fafcaca20eaaf), C64e(0x89f38ef4f47d898e), + C64e(0x208ee947476720e9), C64e(0x2820181010382818), + C64e(0x64ded56f6f0b64d5), C64e(0x83fb88f0f0738388), + C64e(0xb1946f4a4afbb16f), C64e(0x96b8725c5cca9672), + C64e(0x6c70243838546c24), C64e(0x08aef157575f08f1), + C64e(0x52e6c773732152c7), C64e(0xf33551979764f351), + C64e(0x658d23cbcbae6523), C64e(0x84597ca1a125847c), + C64e(0xbfcb9ce8e857bf9c), C64e(0x637c213e3e5d6321), + C64e(0x7c37dd9696ea7cdd), C64e(0x7fc2dc61611e7fdc), + C64e(0x911a860d0d9c9186), C64e(0x941e850f0f9b9485), + C64e(0xabdb90e0e04bab90), C64e(0xc6f8427c7cbac642), + C64e(0x57e2c471712657c4), C64e(0xe583aacccc29e5aa), + C64e(0x733bd89090e373d8), C64e(0x0f0c050606090f05), + C64e(0x03f501f7f7f40301), C64e(0x3638121c1c2a3612), + C64e(0xfe9fa3c2c23cfea3), C64e(0xe1d45f6a6a8be15f), + C64e(0x1047f9aeaebe10f9), C64e(0x6bd2d06969026bd0), + C64e(0xa82e911717bfa891), C64e(0xe82958999971e858), + C64e(0x6974273a3a536927), C64e(0xd04eb92727f7d0b9), + C64e(0x48a938d9d9914838), C64e(0x35cd13ebebde3513), + C64e(0xce56b32b2be5ceb3), C64e(0x5544332222775533), + C64e(0xd6bfbbd2d204d6bb), C64e(0x904970a9a9399070), + C64e(0x800e890707878089), C64e(0xf266a73333c1f2a7), + C64e(0xc15ab62d2decc1b6), C64e(0x6678223c3c5a6622), + C64e(0xad2a921515b8ad92), C64e(0x608920c9c9a96020), + C64e(0xdb154987875cdb49), C64e(0x1a4fffaaaab01aff), + C64e(0x88a0785050d88878), C64e(0x8e517aa5a52b8e7a), + C64e(0x8a068f0303898a8f), C64e(0x13b2f859594a13f8), + C64e(0x9b12800909929b80), C64e(0x3934171a1a233917), + C64e(0x75cada65651075da), C64e(0x53b531d7d7845331), + C64e(0x5113c68484d551c6), C64e(0xd3bbb8d0d003d3b8), + C64e(0x5e1fc38282dc5ec3), C64e(0xcb52b02929e2cbb0), + C64e(0x99b4775a5ac39977), C64e(0x333c111e1e2d3311), + C64e(0x46f6cb7b7b3d46cb), C64e(0x1f4bfca8a8b71ffc), + C64e(0x61dad66d6d0c61d6), C64e(0x4e583a2c2c624e3a) +}; + +#if !SPH_SMALL_FOOTPRINT_GROESTL + +static const sph_u64 T5[] = { + C64e(0xa5f497a5c6c632f4), C64e(0x8497eb84f8f86f97), + C64e(0x99b0c799eeee5eb0), C64e(0x8d8cf78df6f67a8c), + C64e(0x0d17e50dffffe817), C64e(0xbddcb7bdd6d60adc), + C64e(0xb1c8a7b1dede16c8), C64e(0x54fc395491916dfc), + C64e(0x50f0c050606090f0), C64e(0x0305040302020705), + C64e(0xa9e087a9cece2ee0), C64e(0x7d87ac7d5656d187), + C64e(0x192bd519e7e7cc2b), C64e(0x62a67162b5b513a6), + C64e(0xe6319ae64d4d7c31), C64e(0x9ab5c39aecec59b5), + C64e(0x45cf05458f8f40cf), C64e(0x9dbc3e9d1f1fa3bc), + C64e(0x40c00940898949c0), C64e(0x8792ef87fafa6892), + C64e(0x153fc515efefd03f), C64e(0xeb267febb2b29426), + C64e(0xc94007c98e8ece40), C64e(0x0b1ded0bfbfbe61d), + C64e(0xec2f82ec41416e2f), C64e(0x67a97d67b3b31aa9), + C64e(0xfd1cbefd5f5f431c), C64e(0xea258aea45456025), + C64e(0xbfda46bf2323f9da), C64e(0xf702a6f753535102), + C64e(0x96a1d396e4e445a1), C64e(0x5bed2d5b9b9b76ed), + C64e(0xc25deac27575285d), C64e(0x1c24d91ce1e1c524), + C64e(0xaee97aae3d3dd4e9), C64e(0x6abe986a4c4cf2be), + C64e(0x5aeed85a6c6c82ee), C64e(0x41c3fc417e7ebdc3), + C64e(0x0206f102f5f5f306), C64e(0x4fd11d4f838352d1), + C64e(0x5ce4d05c68688ce4), C64e(0xf407a2f451515607), 
+ C64e(0x345cb934d1d18d5c), C64e(0x0818e908f9f9e118), + C64e(0x93aedf93e2e24cae), C64e(0x73954d73abab3e95), + C64e(0x53f5c453626297f5), C64e(0x3f41543f2a2a6b41), + C64e(0x0c14100c08081c14), C64e(0x52f63152959563f6), + C64e(0x65af8c654646e9af), C64e(0x5ee2215e9d9d7fe2), + C64e(0x2878602830304878), C64e(0xa1f86ea13737cff8), + C64e(0x0f11140f0a0a1b11), C64e(0xb5c45eb52f2febc4), + C64e(0x091b1c090e0e151b), C64e(0x365a483624247e5a), + C64e(0x9bb6369b1b1badb6), C64e(0x3d47a53ddfdf9847), + C64e(0x266a8126cdcda76a), C64e(0x69bb9c694e4ef5bb), + C64e(0xcd4cfecd7f7f334c), C64e(0x9fbacf9feaea50ba), + C64e(0x1b2d241b12123f2d), C64e(0x9eb93a9e1d1da4b9), + C64e(0x749cb0745858c49c), C64e(0x2e72682e34344672), + C64e(0x2d776c2d36364177), C64e(0xb2cda3b2dcdc11cd), + C64e(0xee2973eeb4b49d29), C64e(0xfb16b6fb5b5b4d16), + C64e(0xf60153f6a4a4a501), C64e(0x4dd7ec4d7676a1d7), + C64e(0x61a37561b7b714a3), C64e(0xce49face7d7d3449), + C64e(0x7b8da47b5252df8d), C64e(0x3e42a13edddd9f42), + C64e(0x7193bc715e5ecd93), C64e(0x97a226971313b1a2), + C64e(0xf50457f5a6a6a204), C64e(0x68b86968b9b901b8), + C64e(0x0000000000000000), C64e(0x2c74992cc1c1b574), + C64e(0x60a080604040e0a0), C64e(0x1f21dd1fe3e3c221), + C64e(0xc843f2c879793a43), C64e(0xed2c77edb6b69a2c), + C64e(0xbed9b3bed4d40dd9), C64e(0x46ca01468d8d47ca), + C64e(0xd970ced967671770), C64e(0x4bdde44b7272afdd), + C64e(0xde7933de9494ed79), C64e(0xd4672bd49898ff67), + C64e(0xe8237be8b0b09323), C64e(0x4ade114a85855bde), + C64e(0x6bbd6d6bbbbb06bd), C64e(0x2a7e912ac5c5bb7e), + C64e(0xe5349ee54f4f7b34), C64e(0x163ac116ededd73a), + C64e(0xc55417c58686d254), C64e(0xd7622fd79a9af862), + C64e(0x55ffcc55666699ff), C64e(0x94a722941111b6a7), + C64e(0xcf4a0fcf8a8ac04a), C64e(0x1030c910e9e9d930), + C64e(0x060a080604040e0a), C64e(0x8198e781fefe6698), + C64e(0xf00b5bf0a0a0ab0b), C64e(0x44ccf0447878b4cc), + C64e(0xbad54aba2525f0d5), C64e(0xe33e96e34b4b753e), + C64e(0xf30e5ff3a2a2ac0e), C64e(0xfe19bafe5d5d4419), + C64e(0xc05b1bc08080db5b), C64e(0x8a850a8a05058085), + C64e(0xadec7ead3f3fd3ec), C64e(0xbcdf42bc2121fedf), + C64e(0x48d8e0487070a8d8), C64e(0x040cf904f1f1fd0c), + C64e(0xdf7ac6df6363197a), C64e(0xc158eec177772f58), + C64e(0x759f4575afaf309f), C64e(0x63a584634242e7a5), + C64e(0x3050403020207050), C64e(0x1a2ed11ae5e5cb2e), + C64e(0x0e12e10efdfdef12), C64e(0x6db7656dbfbf08b7), + C64e(0x4cd4194c818155d4), C64e(0x143c30141818243c), + C64e(0x355f4c352626795f), C64e(0x2f719d2fc3c3b271), + C64e(0xe13867e1bebe8638), C64e(0xa2fd6aa23535c8fd), + C64e(0xcc4f0bcc8888c74f), C64e(0x394b5c392e2e654b), + C64e(0x57f93d5793936af9), C64e(0xf20daaf25555580d), + C64e(0x829de382fcfc619d), C64e(0x47c9f4477a7ab3c9), + C64e(0xacef8bacc8c827ef), C64e(0xe7326fe7baba8832), + C64e(0x2b7d642b32324f7d), C64e(0x95a4d795e6e642a4), + C64e(0xa0fb9ba0c0c03bfb), C64e(0x98b332981919aab3), + C64e(0xd16827d19e9ef668), C64e(0x7f815d7fa3a32281), + C64e(0x66aa88664444eeaa), C64e(0x7e82a87e5454d682), + C64e(0xabe676ab3b3bdde6), C64e(0x839e16830b0b959e), + C64e(0xca4503ca8c8cc945), C64e(0x297b9529c7c7bc7b), + C64e(0xd36ed6d36b6b056e), C64e(0x3c44503c28286c44), + C64e(0x798b5579a7a72c8b), C64e(0xe23d63e2bcbc813d), + C64e(0x1d272c1d16163127), C64e(0x769a4176adad379a), + C64e(0x3b4dad3bdbdb964d), C64e(0x56fac85664649efa), + C64e(0x4ed2e84e7474a6d2), C64e(0x1e22281e14143622), + C64e(0xdb763fdb9292e476), C64e(0x0a1e180a0c0c121e), + C64e(0x6cb4906c4848fcb4), C64e(0xe4376be4b8b88f37), + C64e(0x5de7255d9f9f78e7), C64e(0x6eb2616ebdbd0fb2), + C64e(0xef2a86ef4343692a), C64e(0xa6f193a6c4c435f1), + C64e(0xa8e372a83939dae3), 
C64e(0xa4f762a43131c6f7), + C64e(0x3759bd37d3d38a59), C64e(0x8b86ff8bf2f27486), + C64e(0x3256b132d5d58356), C64e(0x43c50d438b8b4ec5), + C64e(0x59ebdc596e6e85eb), C64e(0xb7c2afb7dada18c2), + C64e(0x8c8f028c01018e8f), C64e(0x64ac7964b1b11dac), + C64e(0xd26d23d29c9cf16d), C64e(0xe03b92e04949723b), + C64e(0xb4c7abb4d8d81fc7), C64e(0xfa1543faacacb915), + C64e(0x0709fd07f3f3fa09), C64e(0x256f8525cfcfa06f), + C64e(0xafea8fafcaca20ea), C64e(0x8e89f38ef4f47d89), + C64e(0xe9208ee947476720), C64e(0x1828201810103828), + C64e(0xd564ded56f6f0b64), C64e(0x8883fb88f0f07383), + C64e(0x6fb1946f4a4afbb1), C64e(0x7296b8725c5cca96), + C64e(0x246c70243838546c), C64e(0xf108aef157575f08), + C64e(0xc752e6c773732152), C64e(0x51f33551979764f3), + C64e(0x23658d23cbcbae65), C64e(0x7c84597ca1a12584), + C64e(0x9cbfcb9ce8e857bf), C64e(0x21637c213e3e5d63), + C64e(0xdd7c37dd9696ea7c), C64e(0xdc7fc2dc61611e7f), + C64e(0x86911a860d0d9c91), C64e(0x85941e850f0f9b94), + C64e(0x90abdb90e0e04bab), C64e(0x42c6f8427c7cbac6), + C64e(0xc457e2c471712657), C64e(0xaae583aacccc29e5), + C64e(0xd8733bd89090e373), C64e(0x050f0c050606090f), + C64e(0x0103f501f7f7f403), C64e(0x123638121c1c2a36), + C64e(0xa3fe9fa3c2c23cfe), C64e(0x5fe1d45f6a6a8be1), + C64e(0xf91047f9aeaebe10), C64e(0xd06bd2d06969026b), + C64e(0x91a82e911717bfa8), C64e(0x58e82958999971e8), + C64e(0x276974273a3a5369), C64e(0xb9d04eb92727f7d0), + C64e(0x3848a938d9d99148), C64e(0x1335cd13ebebde35), + C64e(0xb3ce56b32b2be5ce), C64e(0x3355443322227755), + C64e(0xbbd6bfbbd2d204d6), C64e(0x70904970a9a93990), + C64e(0x89800e8907078780), C64e(0xa7f266a73333c1f2), + C64e(0xb6c15ab62d2decc1), C64e(0x226678223c3c5a66), + C64e(0x92ad2a921515b8ad), C64e(0x20608920c9c9a960), + C64e(0x49db154987875cdb), C64e(0xff1a4fffaaaab01a), + C64e(0x7888a0785050d888), C64e(0x7a8e517aa5a52b8e), + C64e(0x8f8a068f0303898a), C64e(0xf813b2f859594a13), + C64e(0x809b12800909929b), C64e(0x173934171a1a2339), + C64e(0xda75cada65651075), C64e(0x3153b531d7d78453), + C64e(0xc65113c68484d551), C64e(0xb8d3bbb8d0d003d3), + C64e(0xc35e1fc38282dc5e), C64e(0xb0cb52b02929e2cb), + C64e(0x7799b4775a5ac399), C64e(0x11333c111e1e2d33), + C64e(0xcb46f6cb7b7b3d46), C64e(0xfc1f4bfca8a8b71f), + C64e(0xd661dad66d6d0c61), C64e(0x3a4e583a2c2c624e) +}; + +static const sph_u64 T6[] = { + C64e(0xf4a5f497a5c6c632), C64e(0x978497eb84f8f86f), + C64e(0xb099b0c799eeee5e), C64e(0x8c8d8cf78df6f67a), + C64e(0x170d17e50dffffe8), C64e(0xdcbddcb7bdd6d60a), + C64e(0xc8b1c8a7b1dede16), C64e(0xfc54fc395491916d), + C64e(0xf050f0c050606090), C64e(0x0503050403020207), + C64e(0xe0a9e087a9cece2e), C64e(0x877d87ac7d5656d1), + C64e(0x2b192bd519e7e7cc), C64e(0xa662a67162b5b513), + C64e(0x31e6319ae64d4d7c), C64e(0xb59ab5c39aecec59), + C64e(0xcf45cf05458f8f40), C64e(0xbc9dbc3e9d1f1fa3), + C64e(0xc040c00940898949), C64e(0x928792ef87fafa68), + C64e(0x3f153fc515efefd0), C64e(0x26eb267febb2b294), + C64e(0x40c94007c98e8ece), C64e(0x1d0b1ded0bfbfbe6), + C64e(0x2fec2f82ec41416e), C64e(0xa967a97d67b3b31a), + C64e(0x1cfd1cbefd5f5f43), C64e(0x25ea258aea454560), + C64e(0xdabfda46bf2323f9), C64e(0x02f702a6f7535351), + C64e(0xa196a1d396e4e445), C64e(0xed5bed2d5b9b9b76), + C64e(0x5dc25deac2757528), C64e(0x241c24d91ce1e1c5), + C64e(0xe9aee97aae3d3dd4), C64e(0xbe6abe986a4c4cf2), + C64e(0xee5aeed85a6c6c82), C64e(0xc341c3fc417e7ebd), + C64e(0x060206f102f5f5f3), C64e(0xd14fd11d4f838352), + C64e(0xe45ce4d05c68688c), C64e(0x07f407a2f4515156), + C64e(0x5c345cb934d1d18d), C64e(0x180818e908f9f9e1), + C64e(0xae93aedf93e2e24c), C64e(0x9573954d73abab3e), + C64e(0xf553f5c453626297), 
C64e(0x413f41543f2a2a6b), + C64e(0x140c14100c08081c), C64e(0xf652f63152959563), + C64e(0xaf65af8c654646e9), C64e(0xe25ee2215e9d9d7f), + C64e(0x7828786028303048), C64e(0xf8a1f86ea13737cf), + C64e(0x110f11140f0a0a1b), C64e(0xc4b5c45eb52f2feb), + C64e(0x1b091b1c090e0e15), C64e(0x5a365a483624247e), + C64e(0xb69bb6369b1b1bad), C64e(0x473d47a53ddfdf98), + C64e(0x6a266a8126cdcda7), C64e(0xbb69bb9c694e4ef5), + C64e(0x4ccd4cfecd7f7f33), C64e(0xba9fbacf9feaea50), + C64e(0x2d1b2d241b12123f), C64e(0xb99eb93a9e1d1da4), + C64e(0x9c749cb0745858c4), C64e(0x722e72682e343446), + C64e(0x772d776c2d363641), C64e(0xcdb2cda3b2dcdc11), + C64e(0x29ee2973eeb4b49d), C64e(0x16fb16b6fb5b5b4d), + C64e(0x01f60153f6a4a4a5), C64e(0xd74dd7ec4d7676a1), + C64e(0xa361a37561b7b714), C64e(0x49ce49face7d7d34), + C64e(0x8d7b8da47b5252df), C64e(0x423e42a13edddd9f), + C64e(0x937193bc715e5ecd), C64e(0xa297a226971313b1), + C64e(0x04f50457f5a6a6a2), C64e(0xb868b86968b9b901), + C64e(0x0000000000000000), C64e(0x742c74992cc1c1b5), + C64e(0xa060a080604040e0), C64e(0x211f21dd1fe3e3c2), + C64e(0x43c843f2c879793a), C64e(0x2ced2c77edb6b69a), + C64e(0xd9bed9b3bed4d40d), C64e(0xca46ca01468d8d47), + C64e(0x70d970ced9676717), C64e(0xdd4bdde44b7272af), + C64e(0x79de7933de9494ed), C64e(0x67d4672bd49898ff), + C64e(0x23e8237be8b0b093), C64e(0xde4ade114a85855b), + C64e(0xbd6bbd6d6bbbbb06), C64e(0x7e2a7e912ac5c5bb), + C64e(0x34e5349ee54f4f7b), C64e(0x3a163ac116ededd7), + C64e(0x54c55417c58686d2), C64e(0x62d7622fd79a9af8), + C64e(0xff55ffcc55666699), C64e(0xa794a722941111b6), + C64e(0x4acf4a0fcf8a8ac0), C64e(0x301030c910e9e9d9), + C64e(0x0a060a080604040e), C64e(0x988198e781fefe66), + C64e(0x0bf00b5bf0a0a0ab), C64e(0xcc44ccf0447878b4), + C64e(0xd5bad54aba2525f0), C64e(0x3ee33e96e34b4b75), + C64e(0x0ef30e5ff3a2a2ac), C64e(0x19fe19bafe5d5d44), + C64e(0x5bc05b1bc08080db), C64e(0x858a850a8a050580), + C64e(0xecadec7ead3f3fd3), C64e(0xdfbcdf42bc2121fe), + C64e(0xd848d8e0487070a8), C64e(0x0c040cf904f1f1fd), + C64e(0x7adf7ac6df636319), C64e(0x58c158eec177772f), + C64e(0x9f759f4575afaf30), C64e(0xa563a584634242e7), + C64e(0x5030504030202070), C64e(0x2e1a2ed11ae5e5cb), + C64e(0x120e12e10efdfdef), C64e(0xb76db7656dbfbf08), + C64e(0xd44cd4194c818155), C64e(0x3c143c3014181824), + C64e(0x5f355f4c35262679), C64e(0x712f719d2fc3c3b2), + C64e(0x38e13867e1bebe86), C64e(0xfda2fd6aa23535c8), + C64e(0x4fcc4f0bcc8888c7), C64e(0x4b394b5c392e2e65), + C64e(0xf957f93d5793936a), C64e(0x0df20daaf2555558), + C64e(0x9d829de382fcfc61), C64e(0xc947c9f4477a7ab3), + C64e(0xefacef8bacc8c827), C64e(0x32e7326fe7baba88), + C64e(0x7d2b7d642b32324f), C64e(0xa495a4d795e6e642), + C64e(0xfba0fb9ba0c0c03b), C64e(0xb398b332981919aa), + C64e(0x68d16827d19e9ef6), C64e(0x817f815d7fa3a322), + C64e(0xaa66aa88664444ee), C64e(0x827e82a87e5454d6), + C64e(0xe6abe676ab3b3bdd), C64e(0x9e839e16830b0b95), + C64e(0x45ca4503ca8c8cc9), C64e(0x7b297b9529c7c7bc), + C64e(0x6ed36ed6d36b6b05), C64e(0x443c44503c28286c), + C64e(0x8b798b5579a7a72c), C64e(0x3de23d63e2bcbc81), + C64e(0x271d272c1d161631), C64e(0x9a769a4176adad37), + C64e(0x4d3b4dad3bdbdb96), C64e(0xfa56fac85664649e), + C64e(0xd24ed2e84e7474a6), C64e(0x221e22281e141436), + C64e(0x76db763fdb9292e4), C64e(0x1e0a1e180a0c0c12), + C64e(0xb46cb4906c4848fc), C64e(0x37e4376be4b8b88f), + C64e(0xe75de7255d9f9f78), C64e(0xb26eb2616ebdbd0f), + C64e(0x2aef2a86ef434369), C64e(0xf1a6f193a6c4c435), + C64e(0xe3a8e372a83939da), C64e(0xf7a4f762a43131c6), + C64e(0x593759bd37d3d38a), C64e(0x868b86ff8bf2f274), + C64e(0x563256b132d5d583), C64e(0xc543c50d438b8b4e), + 
C64e(0xeb59ebdc596e6e85), C64e(0xc2b7c2afb7dada18), + C64e(0x8f8c8f028c01018e), C64e(0xac64ac7964b1b11d), + C64e(0x6dd26d23d29c9cf1), C64e(0x3be03b92e0494972), + C64e(0xc7b4c7abb4d8d81f), C64e(0x15fa1543faacacb9), + C64e(0x090709fd07f3f3fa), C64e(0x6f256f8525cfcfa0), + C64e(0xeaafea8fafcaca20), C64e(0x898e89f38ef4f47d), + C64e(0x20e9208ee9474767), C64e(0x2818282018101038), + C64e(0x64d564ded56f6f0b), C64e(0x838883fb88f0f073), + C64e(0xb16fb1946f4a4afb), C64e(0x967296b8725c5cca), + C64e(0x6c246c7024383854), C64e(0x08f108aef157575f), + C64e(0x52c752e6c7737321), C64e(0xf351f33551979764), + C64e(0x6523658d23cbcbae), C64e(0x847c84597ca1a125), + C64e(0xbf9cbfcb9ce8e857), C64e(0x6321637c213e3e5d), + C64e(0x7cdd7c37dd9696ea), C64e(0x7fdc7fc2dc61611e), + C64e(0x9186911a860d0d9c), C64e(0x9485941e850f0f9b), + C64e(0xab90abdb90e0e04b), C64e(0xc642c6f8427c7cba), + C64e(0x57c457e2c4717126), C64e(0xe5aae583aacccc29), + C64e(0x73d8733bd89090e3), C64e(0x0f050f0c05060609), + C64e(0x030103f501f7f7f4), C64e(0x36123638121c1c2a), + C64e(0xfea3fe9fa3c2c23c), C64e(0xe15fe1d45f6a6a8b), + C64e(0x10f91047f9aeaebe), C64e(0x6bd06bd2d0696902), + C64e(0xa891a82e911717bf), C64e(0xe858e82958999971), + C64e(0x69276974273a3a53), C64e(0xd0b9d04eb92727f7), + C64e(0x483848a938d9d991), C64e(0x351335cd13ebebde), + C64e(0xceb3ce56b32b2be5), C64e(0x5533554433222277), + C64e(0xd6bbd6bfbbd2d204), C64e(0x9070904970a9a939), + C64e(0x8089800e89070787), C64e(0xf2a7f266a73333c1), + C64e(0xc1b6c15ab62d2dec), C64e(0x66226678223c3c5a), + C64e(0xad92ad2a921515b8), C64e(0x6020608920c9c9a9), + C64e(0xdb49db154987875c), C64e(0x1aff1a4fffaaaab0), + C64e(0x887888a0785050d8), C64e(0x8e7a8e517aa5a52b), + C64e(0x8a8f8a068f030389), C64e(0x13f813b2f859594a), + C64e(0x9b809b1280090992), C64e(0x39173934171a1a23), + C64e(0x75da75cada656510), C64e(0x533153b531d7d784), + C64e(0x51c65113c68484d5), C64e(0xd3b8d3bbb8d0d003), + C64e(0x5ec35e1fc38282dc), C64e(0xcbb0cb52b02929e2), + C64e(0x997799b4775a5ac3), C64e(0x3311333c111e1e2d), + C64e(0x46cb46f6cb7b7b3d), C64e(0x1ffc1f4bfca8a8b7), + C64e(0x61d661dad66d6d0c), C64e(0x4e3a4e583a2c2c62) +}; + +static const sph_u64 T7[] = { + C64e(0x32f4a5f497a5c6c6), C64e(0x6f978497eb84f8f8), + C64e(0x5eb099b0c799eeee), C64e(0x7a8c8d8cf78df6f6), + C64e(0xe8170d17e50dffff), C64e(0x0adcbddcb7bdd6d6), + C64e(0x16c8b1c8a7b1dede), C64e(0x6dfc54fc39549191), + C64e(0x90f050f0c0506060), C64e(0x0705030504030202), + C64e(0x2ee0a9e087a9cece), C64e(0xd1877d87ac7d5656), + C64e(0xcc2b192bd519e7e7), C64e(0x13a662a67162b5b5), + C64e(0x7c31e6319ae64d4d), C64e(0x59b59ab5c39aecec), + C64e(0x40cf45cf05458f8f), C64e(0xa3bc9dbc3e9d1f1f), + C64e(0x49c040c009408989), C64e(0x68928792ef87fafa), + C64e(0xd03f153fc515efef), C64e(0x9426eb267febb2b2), + C64e(0xce40c94007c98e8e), C64e(0xe61d0b1ded0bfbfb), + C64e(0x6e2fec2f82ec4141), C64e(0x1aa967a97d67b3b3), + C64e(0x431cfd1cbefd5f5f), C64e(0x6025ea258aea4545), + C64e(0xf9dabfda46bf2323), C64e(0x5102f702a6f75353), + C64e(0x45a196a1d396e4e4), C64e(0x76ed5bed2d5b9b9b), + C64e(0x285dc25deac27575), C64e(0xc5241c24d91ce1e1), + C64e(0xd4e9aee97aae3d3d), C64e(0xf2be6abe986a4c4c), + C64e(0x82ee5aeed85a6c6c), C64e(0xbdc341c3fc417e7e), + C64e(0xf3060206f102f5f5), C64e(0x52d14fd11d4f8383), + C64e(0x8ce45ce4d05c6868), C64e(0x5607f407a2f45151), + C64e(0x8d5c345cb934d1d1), C64e(0xe1180818e908f9f9), + C64e(0x4cae93aedf93e2e2), C64e(0x3e9573954d73abab), + C64e(0x97f553f5c4536262), C64e(0x6b413f41543f2a2a), + C64e(0x1c140c14100c0808), C64e(0x63f652f631529595), + C64e(0xe9af65af8c654646), C64e(0x7fe25ee2215e9d9d), + 
C64e(0x4878287860283030), C64e(0xcff8a1f86ea13737), + C64e(0x1b110f11140f0a0a), C64e(0xebc4b5c45eb52f2f), + C64e(0x151b091b1c090e0e), C64e(0x7e5a365a48362424), + C64e(0xadb69bb6369b1b1b), C64e(0x98473d47a53ddfdf), + C64e(0xa76a266a8126cdcd), C64e(0xf5bb69bb9c694e4e), + C64e(0x334ccd4cfecd7f7f), C64e(0x50ba9fbacf9feaea), + C64e(0x3f2d1b2d241b1212), C64e(0xa4b99eb93a9e1d1d), + C64e(0xc49c749cb0745858), C64e(0x46722e72682e3434), + C64e(0x41772d776c2d3636), C64e(0x11cdb2cda3b2dcdc), + C64e(0x9d29ee2973eeb4b4), C64e(0x4d16fb16b6fb5b5b), + C64e(0xa501f60153f6a4a4), C64e(0xa1d74dd7ec4d7676), + C64e(0x14a361a37561b7b7), C64e(0x3449ce49face7d7d), + C64e(0xdf8d7b8da47b5252), C64e(0x9f423e42a13edddd), + C64e(0xcd937193bc715e5e), C64e(0xb1a297a226971313), + C64e(0xa204f50457f5a6a6), C64e(0x01b868b86968b9b9), + C64e(0x0000000000000000), C64e(0xb5742c74992cc1c1), + C64e(0xe0a060a080604040), C64e(0xc2211f21dd1fe3e3), + C64e(0x3a43c843f2c87979), C64e(0x9a2ced2c77edb6b6), + C64e(0x0dd9bed9b3bed4d4), C64e(0x47ca46ca01468d8d), + C64e(0x1770d970ced96767), C64e(0xafdd4bdde44b7272), + C64e(0xed79de7933de9494), C64e(0xff67d4672bd49898), + C64e(0x9323e8237be8b0b0), C64e(0x5bde4ade114a8585), + C64e(0x06bd6bbd6d6bbbbb), C64e(0xbb7e2a7e912ac5c5), + C64e(0x7b34e5349ee54f4f), C64e(0xd73a163ac116eded), + C64e(0xd254c55417c58686), C64e(0xf862d7622fd79a9a), + C64e(0x99ff55ffcc556666), C64e(0xb6a794a722941111), + C64e(0xc04acf4a0fcf8a8a), C64e(0xd9301030c910e9e9), + C64e(0x0e0a060a08060404), C64e(0x66988198e781fefe), + C64e(0xab0bf00b5bf0a0a0), C64e(0xb4cc44ccf0447878), + C64e(0xf0d5bad54aba2525), C64e(0x753ee33e96e34b4b), + C64e(0xac0ef30e5ff3a2a2), C64e(0x4419fe19bafe5d5d), + C64e(0xdb5bc05b1bc08080), C64e(0x80858a850a8a0505), + C64e(0xd3ecadec7ead3f3f), C64e(0xfedfbcdf42bc2121), + C64e(0xa8d848d8e0487070), C64e(0xfd0c040cf904f1f1), + C64e(0x197adf7ac6df6363), C64e(0x2f58c158eec17777), + C64e(0x309f759f4575afaf), C64e(0xe7a563a584634242), + C64e(0x7050305040302020), C64e(0xcb2e1a2ed11ae5e5), + C64e(0xef120e12e10efdfd), C64e(0x08b76db7656dbfbf), + C64e(0x55d44cd4194c8181), C64e(0x243c143c30141818), + C64e(0x795f355f4c352626), C64e(0xb2712f719d2fc3c3), + C64e(0x8638e13867e1bebe), C64e(0xc8fda2fd6aa23535), + C64e(0xc74fcc4f0bcc8888), C64e(0x654b394b5c392e2e), + C64e(0x6af957f93d579393), C64e(0x580df20daaf25555), + C64e(0x619d829de382fcfc), C64e(0xb3c947c9f4477a7a), + C64e(0x27efacef8bacc8c8), C64e(0x8832e7326fe7baba), + C64e(0x4f7d2b7d642b3232), C64e(0x42a495a4d795e6e6), + C64e(0x3bfba0fb9ba0c0c0), C64e(0xaab398b332981919), + C64e(0xf668d16827d19e9e), C64e(0x22817f815d7fa3a3), + C64e(0xeeaa66aa88664444), C64e(0xd6827e82a87e5454), + C64e(0xdde6abe676ab3b3b), C64e(0x959e839e16830b0b), + C64e(0xc945ca4503ca8c8c), C64e(0xbc7b297b9529c7c7), + C64e(0x056ed36ed6d36b6b), C64e(0x6c443c44503c2828), + C64e(0x2c8b798b5579a7a7), C64e(0x813de23d63e2bcbc), + C64e(0x31271d272c1d1616), C64e(0x379a769a4176adad), + C64e(0x964d3b4dad3bdbdb), C64e(0x9efa56fac8566464), + C64e(0xa6d24ed2e84e7474), C64e(0x36221e22281e1414), + C64e(0xe476db763fdb9292), C64e(0x121e0a1e180a0c0c), + C64e(0xfcb46cb4906c4848), C64e(0x8f37e4376be4b8b8), + C64e(0x78e75de7255d9f9f), C64e(0x0fb26eb2616ebdbd), + C64e(0x692aef2a86ef4343), C64e(0x35f1a6f193a6c4c4), + C64e(0xdae3a8e372a83939), C64e(0xc6f7a4f762a43131), + C64e(0x8a593759bd37d3d3), C64e(0x74868b86ff8bf2f2), + C64e(0x83563256b132d5d5), C64e(0x4ec543c50d438b8b), + C64e(0x85eb59ebdc596e6e), C64e(0x18c2b7c2afb7dada), + C64e(0x8e8f8c8f028c0101), C64e(0x1dac64ac7964b1b1), + C64e(0xf16dd26d23d29c9c), 
C64e(0x723be03b92e04949), + C64e(0x1fc7b4c7abb4d8d8), C64e(0xb915fa1543faacac), + C64e(0xfa090709fd07f3f3), C64e(0xa06f256f8525cfcf), + C64e(0x20eaafea8fafcaca), C64e(0x7d898e89f38ef4f4), + C64e(0x6720e9208ee94747), C64e(0x3828182820181010), + C64e(0x0b64d564ded56f6f), C64e(0x73838883fb88f0f0), + C64e(0xfbb16fb1946f4a4a), C64e(0xca967296b8725c5c), + C64e(0x546c246c70243838), C64e(0x5f08f108aef15757), + C64e(0x2152c752e6c77373), C64e(0x64f351f335519797), + C64e(0xae6523658d23cbcb), C64e(0x25847c84597ca1a1), + C64e(0x57bf9cbfcb9ce8e8), C64e(0x5d6321637c213e3e), + C64e(0xea7cdd7c37dd9696), C64e(0x1e7fdc7fc2dc6161), + C64e(0x9c9186911a860d0d), C64e(0x9b9485941e850f0f), + C64e(0x4bab90abdb90e0e0), C64e(0xbac642c6f8427c7c), + C64e(0x2657c457e2c47171), C64e(0x29e5aae583aacccc), + C64e(0xe373d8733bd89090), C64e(0x090f050f0c050606), + C64e(0xf4030103f501f7f7), C64e(0x2a36123638121c1c), + C64e(0x3cfea3fe9fa3c2c2), C64e(0x8be15fe1d45f6a6a), + C64e(0xbe10f91047f9aeae), C64e(0x026bd06bd2d06969), + C64e(0xbfa891a82e911717), C64e(0x71e858e829589999), + C64e(0x5369276974273a3a), C64e(0xf7d0b9d04eb92727), + C64e(0x91483848a938d9d9), C64e(0xde351335cd13ebeb), + C64e(0xe5ceb3ce56b32b2b), C64e(0x7755335544332222), + C64e(0x04d6bbd6bfbbd2d2), C64e(0x399070904970a9a9), + C64e(0x878089800e890707), C64e(0xc1f2a7f266a73333), + C64e(0xecc1b6c15ab62d2d), C64e(0x5a66226678223c3c), + C64e(0xb8ad92ad2a921515), C64e(0xa96020608920c9c9), + C64e(0x5cdb49db15498787), C64e(0xb01aff1a4fffaaaa), + C64e(0xd8887888a0785050), C64e(0x2b8e7a8e517aa5a5), + C64e(0x898a8f8a068f0303), C64e(0x4a13f813b2f85959), + C64e(0x929b809b12800909), C64e(0x2339173934171a1a), + C64e(0x1075da75cada6565), C64e(0x84533153b531d7d7), + C64e(0xd551c65113c68484), C64e(0x03d3b8d3bbb8d0d0), + C64e(0xdc5ec35e1fc38282), C64e(0xe2cbb0cb52b02929), + C64e(0xc3997799b4775a5a), C64e(0x2d3311333c111e1e), + C64e(0x3d46cb46f6cb7b7b), C64e(0xb71ffc1f4bfca8a8), + C64e(0x0c61d661dad66d6d), C64e(0x624e3a4e583a2c2c) +}; + +#endif + +#define DECL_STATE_SMALL \ + sph_u64 H[8]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RSTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#define ROUND_SMALL_P(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= PC64(0x00, r); \ + a[1] ^= PC64(0x10, r); \ + a[2] ^= PC64(0x20, r); \ + a[3] ^= PC64(0x30, r); \ + a[4] ^= PC64(0x40, r); \ + a[5] ^= PC64(0x50, r); \ + a[6] ^= PC64(0x60, r); \ + a[7] ^= PC64(0x70, r); \ + RSTT(0, a, 0, 1, 2, 3, 4, 5, 6, 7); \ + RSTT(1, a, 1, 2, 3, 4, 5, 6, 7, 0); \ + RSTT(2, a, 2, 3, 4, 5, 6, 7, 0, 1); \ + RSTT(3, a, 3, 4, 5, 6, 7, 0, 1, 2); \ + RSTT(4, a, 4, 5, 6, 7, 0, 1, 2, 3); \ + RSTT(5, a, 5, 6, 7, 0, 1, 2, 3, 4); \ + RSTT(6, a, 6, 7, 0, 1, 2, 3, 4, 5); \ + RSTT(7, a, 7, 0, 1, 2, 3, 4, 5, 6); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + 
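/* t[] now holds the round output: the PC64 XORs above add the round constant (AddRoundConstant), and RSTT folds SubBytes, ShiftBytes and MixBytes into eight table lookups; the assignments below copy it back into the state */ \ +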
a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u64 t[8]; \ + a[0] ^= QC64(0x00, r); \ + a[1] ^= QC64(0x10, r); \ + a[2] ^= QC64(0x20, r); \ + a[3] ^= QC64(0x30, r); \ + a[4] ^= QC64(0x40, r); \ + a[5] ^= QC64(0x50, r); \ + a[6] ^= QC64(0x60, r); \ + a[7] ^= QC64(0x70, r); \ + RSTT(0, a, 1, 3, 5, 7, 0, 2, 4, 6); \ + RSTT(1, a, 2, 4, 6, 0, 1, 3, 5, 7); \ + RSTT(2, a, 3, 5, 7, 1, 2, 4, 6, 0); \ + RSTT(3, a, 4, 6, 0, 2, 3, 5, 7, 1); \ + RSTT(4, a, 5, 7, 1, 3, 4, 6, 0, 2); \ + RSTT(5, a, 6, 0, 2, 4, 5, 7, 1, 3); \ + RSTT(6, a, 7, 1, 3, 5, 6, 0, 2, 4); \ + RSTT(7, a, 0, 2, 4, 6, 7, 1, 3, 5); \ + a[0] = t[0]; \ + a[1] = t[1]; \ + a[2] = t[2]; \ + a[3] = t[3]; \ + a[4] = t[4]; \ + a[5] = t[5]; \ + a[6] = t[6]; \ + a[7] = t[7]; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +/* + * Apparently, unrolling more than that confuses GCC, resulting in + * lower performance, even though L1 cache would be no problem. + */ +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u64 g[8], m[8]; \ + size_t u; \ + for (u = 0; u < 8; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u64 x[8]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 8; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u64 H[16]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.wide, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.wide, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ R64(T0[B64_1(a[b1])], 8) \ + ^ R64(T0[B64_2(a[b2])], 16) \ + ^ R64(T0[B64_3(a[b3])], 24) \ + ^ T4[B64_4(a[b4])] \ + ^ R64(T4[B64_5(a[b5])], 8) \ + ^ R64(T4[B64_6(a[b6])], 16) \ + ^ R64(T4[B64_7(a[b7])], 24); \ + } while (0) + +#else + +#define RBTT(d, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d] = T0[B64_0(a[b0])] \ + ^ T1[B64_1(a[b1])] \ + ^ T2[B64_2(a[b2])] \ + ^ T3[B64_3(a[b3])] \ + ^ T4[B64_4(a[b4])] \ + ^ T5[B64_5(a[b5])] \ + ^ T6[B64_6(a[b6])] \ + ^ T7[B64_7(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, u + 0, (u + 1) & 0xF, \ + (u + 2) & 0xF, (u + 3) & 0xF, (u + 4) & 0xF, \ + (u + 5) 
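/* column indices are reduced mod 16 ("& 0xF") to wrap around the 16-column wide state; the source columns 0,1,2,3,4,5,6,11 are the row shifts of the wide permutation P */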
& 0xF, (u + 6) & 0xF, (u + 11) & 0xF); \ + RBTT(u + 1, a, u + 1, (u + 2) & 0xF, \ + (u + 3) & 0xF, (u + 4) & 0xF, (u + 5) & 0xF, \ + (u + 6) & 0xF, (u + 7) & 0xF, (u + 12) & 0xF); \ + RBTT(u + 2, a, u + 2, (u + 3) & 0xF, \ + (u + 4) & 0xF, (u + 5) & 0xF, (u + 6) & 0xF, \ + (u + 7) & 0xF, (u + 8) & 0xF, (u + 13) & 0xF); \ + RBTT(u + 3, a, u + 3, (u + 4) & 0xF, \ + (u + 5) & 0xF, (u + 6) & 0xF, (u + 7) & 0xF, \ + (u + 8) & 0xF, (u + 9) & 0xF, (u + 14) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u64 t[16]; \ + size_t u; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + for (u = 0; u < 16; u += 4) { \ + RBTT(u + 0, a, (u + 1) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 11) & 0xF, (u + 0) & 0xF, \ + (u + 2) & 0xF, (u + 4) & 0xF, (u + 6) & 0xF); \ + RBTT(u + 1, a, (u + 2) & 0xF, (u + 4) & 0xF, \ + (u + 6) & 0xF, (u + 12) & 0xF, (u + 1) & 0xF, \ + (u + 3) & 0xF, (u + 5) & 0xF, (u + 7) & 0xF); \ + RBTT(u + 2, a, (u + 3) & 0xF, (u + 5) & 0xF, \ + (u + 7) & 0xF, (u + 13) & 0xF, (u + 2) & 0xF, \ + (u + 4) & 0xF, (u + 6) & 0xF, (u + 8) & 0xF); \ + RBTT(u + 3, a, (u + 4) & 0xF, (u + 6) & 0xF, \ + (u + 8) & 0xF, (u + 14) & 0xF, (u + 3) & 0xF, \ + (u + 5) & 0xF, (u + 7) & 0xF, (u + 9) & 0xF); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= PC64(0x00, r); \ + a[0x1] ^= PC64(0x10, r); \ + a[0x2] ^= PC64(0x20, r); \ + a[0x3] ^= PC64(0x30, r); \ + a[0x4] ^= PC64(0x40, r); \ + a[0x5] ^= PC64(0x50, r); \ + a[0x6] ^= PC64(0x60, r); \ + a[0x7] ^= PC64(0x70, r); \ + a[0x8] ^= PC64(0x80, r); \ + a[0x9] ^= PC64(0x90, r); \ + a[0xA] ^= PC64(0xA0, r); \ + a[0xB] ^= PC64(0xB0, r); \ + a[0xC] ^= PC64(0xC0, r); \ + a[0xD] ^= PC64(0xD0, r); \ + a[0xE] ^= PC64(0xE0, r); \ + a[0xF] ^= PC64(0xF0, r); \ + RBTT(0x0, a, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0xB); \ + RBTT(0x1, a, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0xC); \ + RBTT(0x2, a, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0xD); \ + RBTT(0x3, a, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xE); \ + RBTT(0x4, a, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xF); \ + RBTT(0x5, a, 0x5, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0x0); \ + RBTT(0x6, a, 0x6, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0x1); \ + RBTT(0x7, a, 0x7, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0x2); \ + RBTT(0x8, a, 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0x3); \ + RBTT(0x9, a, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x4); \ + RBTT(0xA, a, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x5); \ + RBTT(0xB, a, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x6); \ + RBTT(0xC, a, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x7); \ + RBTT(0xD, a, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x8); \ + RBTT(0xE, a, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x9); \ + RBTT(0xF, a, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0xA); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#define ROUND_BIG_Q(a, 
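/* r: round index; Q differs from P in its round constants (QC64) and in its row shifts 1,3,5,11,0,2,4,6 */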
r) do { \ + sph_u64 t[16]; \ + a[0x0] ^= QC64(0x00, r); \ + a[0x1] ^= QC64(0x10, r); \ + a[0x2] ^= QC64(0x20, r); \ + a[0x3] ^= QC64(0x30, r); \ + a[0x4] ^= QC64(0x40, r); \ + a[0x5] ^= QC64(0x50, r); \ + a[0x6] ^= QC64(0x60, r); \ + a[0x7] ^= QC64(0x70, r); \ + a[0x8] ^= QC64(0x80, r); \ + a[0x9] ^= QC64(0x90, r); \ + a[0xA] ^= QC64(0xA0, r); \ + a[0xB] ^= QC64(0xB0, r); \ + a[0xC] ^= QC64(0xC0, r); \ + a[0xD] ^= QC64(0xD0, r); \ + a[0xE] ^= QC64(0xE0, r); \ + a[0xF] ^= QC64(0xF0, r); \ + RBTT(0x0, a, 0x1, 0x3, 0x5, 0xB, 0x0, 0x2, 0x4, 0x6); \ + RBTT(0x1, a, 0x2, 0x4, 0x6, 0xC, 0x1, 0x3, 0x5, 0x7); \ + RBTT(0x2, a, 0x3, 0x5, 0x7, 0xD, 0x2, 0x4, 0x6, 0x8); \ + RBTT(0x3, a, 0x4, 0x6, 0x8, 0xE, 0x3, 0x5, 0x7, 0x9); \ + RBTT(0x4, a, 0x5, 0x7, 0x9, 0xF, 0x4, 0x6, 0x8, 0xA); \ + RBTT(0x5, a, 0x6, 0x8, 0xA, 0x0, 0x5, 0x7, 0x9, 0xB); \ + RBTT(0x6, a, 0x7, 0x9, 0xB, 0x1, 0x6, 0x8, 0xA, 0xC); \ + RBTT(0x7, a, 0x8, 0xA, 0xC, 0x2, 0x7, 0x9, 0xB, 0xD); \ + RBTT(0x8, a, 0x9, 0xB, 0xD, 0x3, 0x8, 0xA, 0xC, 0xE); \ + RBTT(0x9, a, 0xA, 0xC, 0xE, 0x4, 0x9, 0xB, 0xD, 0xF); \ + RBTT(0xA, a, 0xB, 0xD, 0xF, 0x5, 0xA, 0xC, 0xE, 0x0); \ + RBTT(0xB, a, 0xC, 0xE, 0x0, 0x6, 0xB, 0xD, 0xF, 0x1); \ + RBTT(0xC, a, 0xD, 0xF, 0x1, 0x7, 0xC, 0xE, 0x0, 0x2); \ + RBTT(0xD, a, 0xE, 0x0, 0x2, 0x8, 0xD, 0xF, 0x1, 0x3); \ + RBTT(0xE, a, 0xF, 0x1, 0x3, 0x9, 0xE, 0x0, 0x2, 0x4); \ + RBTT(0xF, a, 0x0, 0x2, 0x4, 0xA, 0xF, 0x1, 0x3, 0x5); \ + a[0x0] = t[0x0]; \ + a[0x1] = t[0x1]; \ + a[0x2] = t[0x2]; \ + a[0x3] = t[0x3]; \ + a[0x4] = t[0x4]; \ + a[0x5] = t[0x5]; \ + a[0x6] = t[0x6]; \ + a[0x7] = t[0x7]; \ + a[0x8] = t[0x8]; \ + a[0x9] = t[0x9]; \ + a[0xA] = t[0xA]; \ + a[0xB] = t[0xB]; \ + a[0xC] = t[0xC]; \ + a[0xD] = t[0xD]; \ + a[0xE] = t[0xE]; \ + a[0xF] = t[0xF]; \ + } while (0) + +#endif + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +/* obsolete +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16], *ya; \ + const sph_u64 *yc; \ + size_t u; \ + int i; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + ya = g; \ + yc = CP; \ + for (i = 0; i < 2; i ++) { \ + PERM_BIG(ya, yc); \ + ya = m; \ + yc = CQ; \ + } \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +#else +*/ + +#define COMPRESS_BIG do { \ + sph_u64 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec64e_aligned(buf + (u << 3)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 16; u ++) { \ + H[u] ^= g[u] ^ m[u]; \ + } \ + } while (0) + +/* obsolete +#endif +*/ + +#define FINAL_BIG do { \ + sph_u64 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#else + +static const sph_u32 T0up[] = { + C32e(0xc632f4a5), C32e(0xf86f9784), C32e(0xee5eb099), C32e(0xf67a8c8d), + C32e(0xffe8170d), C32e(0xd60adcbd), C32e(0xde16c8b1), C32e(0x916dfc54), + C32e(0x6090f050), C32e(0x02070503), C32e(0xce2ee0a9), C32e(0x56d1877d), + C32e(0xe7cc2b19), C32e(0xb513a662), C32e(0x4d7c31e6), C32e(0xec59b59a), + C32e(0x8f40cf45), C32e(0x1fa3bc9d), C32e(0x8949c040), C32e(0xfa689287), + C32e(0xefd03f15), C32e(0xb29426eb), C32e(0x8ece40c9), C32e(0xfbe61d0b), + C32e(0x416e2fec), 
C32e(0xb31aa967), C32e(0x5f431cfd), C32e(0x456025ea), + C32e(0x23f9dabf), C32e(0x535102f7), C32e(0xe445a196), C32e(0x9b76ed5b), + C32e(0x75285dc2), C32e(0xe1c5241c), C32e(0x3dd4e9ae), C32e(0x4cf2be6a), + C32e(0x6c82ee5a), C32e(0x7ebdc341), C32e(0xf5f30602), C32e(0x8352d14f), + C32e(0x688ce45c), C32e(0x515607f4), C32e(0xd18d5c34), C32e(0xf9e11808), + C32e(0xe24cae93), C32e(0xab3e9573), C32e(0x6297f553), C32e(0x2a6b413f), + C32e(0x081c140c), C32e(0x9563f652), C32e(0x46e9af65), C32e(0x9d7fe25e), + C32e(0x30487828), C32e(0x37cff8a1), C32e(0x0a1b110f), C32e(0x2febc4b5), + C32e(0x0e151b09), C32e(0x247e5a36), C32e(0x1badb69b), C32e(0xdf98473d), + C32e(0xcda76a26), C32e(0x4ef5bb69), C32e(0x7f334ccd), C32e(0xea50ba9f), + C32e(0x123f2d1b), C32e(0x1da4b99e), C32e(0x58c49c74), C32e(0x3446722e), + C32e(0x3641772d), C32e(0xdc11cdb2), C32e(0xb49d29ee), C32e(0x5b4d16fb), + C32e(0xa4a501f6), C32e(0x76a1d74d), C32e(0xb714a361), C32e(0x7d3449ce), + C32e(0x52df8d7b), C32e(0xdd9f423e), C32e(0x5ecd9371), C32e(0x13b1a297), + C32e(0xa6a204f5), C32e(0xb901b868), C32e(0x00000000), C32e(0xc1b5742c), + C32e(0x40e0a060), C32e(0xe3c2211f), C32e(0x793a43c8), C32e(0xb69a2ced), + C32e(0xd40dd9be), C32e(0x8d47ca46), C32e(0x671770d9), C32e(0x72afdd4b), + C32e(0x94ed79de), C32e(0x98ff67d4), C32e(0xb09323e8), C32e(0x855bde4a), + C32e(0xbb06bd6b), C32e(0xc5bb7e2a), C32e(0x4f7b34e5), C32e(0xedd73a16), + C32e(0x86d254c5), C32e(0x9af862d7), C32e(0x6699ff55), C32e(0x11b6a794), + C32e(0x8ac04acf), C32e(0xe9d93010), C32e(0x040e0a06), C32e(0xfe669881), + C32e(0xa0ab0bf0), C32e(0x78b4cc44), C32e(0x25f0d5ba), C32e(0x4b753ee3), + C32e(0xa2ac0ef3), C32e(0x5d4419fe), C32e(0x80db5bc0), C32e(0x0580858a), + C32e(0x3fd3ecad), C32e(0x21fedfbc), C32e(0x70a8d848), C32e(0xf1fd0c04), + C32e(0x63197adf), C32e(0x772f58c1), C32e(0xaf309f75), C32e(0x42e7a563), + C32e(0x20705030), C32e(0xe5cb2e1a), C32e(0xfdef120e), C32e(0xbf08b76d), + C32e(0x8155d44c), C32e(0x18243c14), C32e(0x26795f35), C32e(0xc3b2712f), + C32e(0xbe8638e1), C32e(0x35c8fda2), C32e(0x88c74fcc), C32e(0x2e654b39), + C32e(0x936af957), C32e(0x55580df2), C32e(0xfc619d82), C32e(0x7ab3c947), + C32e(0xc827efac), C32e(0xba8832e7), C32e(0x324f7d2b), C32e(0xe642a495), + C32e(0xc03bfba0), C32e(0x19aab398), C32e(0x9ef668d1), C32e(0xa322817f), + C32e(0x44eeaa66), C32e(0x54d6827e), C32e(0x3bdde6ab), C32e(0x0b959e83), + C32e(0x8cc945ca), C32e(0xc7bc7b29), C32e(0x6b056ed3), C32e(0x286c443c), + C32e(0xa72c8b79), C32e(0xbc813de2), C32e(0x1631271d), C32e(0xad379a76), + C32e(0xdb964d3b), C32e(0x649efa56), C32e(0x74a6d24e), C32e(0x1436221e), + C32e(0x92e476db), C32e(0x0c121e0a), C32e(0x48fcb46c), C32e(0xb88f37e4), + C32e(0x9f78e75d), C32e(0xbd0fb26e), C32e(0x43692aef), C32e(0xc435f1a6), + C32e(0x39dae3a8), C32e(0x31c6f7a4), C32e(0xd38a5937), C32e(0xf274868b), + C32e(0xd5835632), C32e(0x8b4ec543), C32e(0x6e85eb59), C32e(0xda18c2b7), + C32e(0x018e8f8c), C32e(0xb11dac64), C32e(0x9cf16dd2), C32e(0x49723be0), + C32e(0xd81fc7b4), C32e(0xacb915fa), C32e(0xf3fa0907), C32e(0xcfa06f25), + C32e(0xca20eaaf), C32e(0xf47d898e), C32e(0x476720e9), C32e(0x10382818), + C32e(0x6f0b64d5), C32e(0xf0738388), C32e(0x4afbb16f), C32e(0x5cca9672), + C32e(0x38546c24), C32e(0x575f08f1), C32e(0x732152c7), C32e(0x9764f351), + C32e(0xcbae6523), C32e(0xa125847c), C32e(0xe857bf9c), C32e(0x3e5d6321), + C32e(0x96ea7cdd), C32e(0x611e7fdc), C32e(0x0d9c9186), C32e(0x0f9b9485), + C32e(0xe04bab90), C32e(0x7cbac642), C32e(0x712657c4), C32e(0xcc29e5aa), + C32e(0x90e373d8), C32e(0x06090f05), C32e(0xf7f40301), C32e(0x1c2a3612), + C32e(0xc23cfea3), 
C32e(0x6a8be15f), C32e(0xaebe10f9), C32e(0x69026bd0), + C32e(0x17bfa891), C32e(0x9971e858), C32e(0x3a536927), C32e(0x27f7d0b9), + C32e(0xd9914838), C32e(0xebde3513), C32e(0x2be5ceb3), C32e(0x22775533), + C32e(0xd204d6bb), C32e(0xa9399070), C32e(0x07878089), C32e(0x33c1f2a7), + C32e(0x2decc1b6), C32e(0x3c5a6622), C32e(0x15b8ad92), C32e(0xc9a96020), + C32e(0x875cdb49), C32e(0xaab01aff), C32e(0x50d88878), C32e(0xa52b8e7a), + C32e(0x03898a8f), C32e(0x594a13f8), C32e(0x09929b80), C32e(0x1a233917), + C32e(0x651075da), C32e(0xd7845331), C32e(0x84d551c6), C32e(0xd003d3b8), + C32e(0x82dc5ec3), C32e(0x29e2cbb0), C32e(0x5ac39977), C32e(0x1e2d3311), + C32e(0x7b3d46cb), C32e(0xa8b71ffc), C32e(0x6d0c61d6), C32e(0x2c624e3a) +}; + +static const sph_u32 T0dn[] = { + C32e(0xf497a5c6), C32e(0x97eb84f8), C32e(0xb0c799ee), C32e(0x8cf78df6), + C32e(0x17e50dff), C32e(0xdcb7bdd6), C32e(0xc8a7b1de), C32e(0xfc395491), + C32e(0xf0c05060), C32e(0x05040302), C32e(0xe087a9ce), C32e(0x87ac7d56), + C32e(0x2bd519e7), C32e(0xa67162b5), C32e(0x319ae64d), C32e(0xb5c39aec), + C32e(0xcf05458f), C32e(0xbc3e9d1f), C32e(0xc0094089), C32e(0x92ef87fa), + C32e(0x3fc515ef), C32e(0x267febb2), C32e(0x4007c98e), C32e(0x1ded0bfb), + C32e(0x2f82ec41), C32e(0xa97d67b3), C32e(0x1cbefd5f), C32e(0x258aea45), + C32e(0xda46bf23), C32e(0x02a6f753), C32e(0xa1d396e4), C32e(0xed2d5b9b), + C32e(0x5deac275), C32e(0x24d91ce1), C32e(0xe97aae3d), C32e(0xbe986a4c), + C32e(0xeed85a6c), C32e(0xc3fc417e), C32e(0x06f102f5), C32e(0xd11d4f83), + C32e(0xe4d05c68), C32e(0x07a2f451), C32e(0x5cb934d1), C32e(0x18e908f9), + C32e(0xaedf93e2), C32e(0x954d73ab), C32e(0xf5c45362), C32e(0x41543f2a), + C32e(0x14100c08), C32e(0xf6315295), C32e(0xaf8c6546), C32e(0xe2215e9d), + C32e(0x78602830), C32e(0xf86ea137), C32e(0x11140f0a), C32e(0xc45eb52f), + C32e(0x1b1c090e), C32e(0x5a483624), C32e(0xb6369b1b), C32e(0x47a53ddf), + C32e(0x6a8126cd), C32e(0xbb9c694e), C32e(0x4cfecd7f), C32e(0xbacf9fea), + C32e(0x2d241b12), C32e(0xb93a9e1d), C32e(0x9cb07458), C32e(0x72682e34), + C32e(0x776c2d36), C32e(0xcda3b2dc), C32e(0x2973eeb4), C32e(0x16b6fb5b), + C32e(0x0153f6a4), C32e(0xd7ec4d76), C32e(0xa37561b7), C32e(0x49face7d), + C32e(0x8da47b52), C32e(0x42a13edd), C32e(0x93bc715e), C32e(0xa2269713), + C32e(0x0457f5a6), C32e(0xb86968b9), C32e(0x00000000), C32e(0x74992cc1), + C32e(0xa0806040), C32e(0x21dd1fe3), C32e(0x43f2c879), C32e(0x2c77edb6), + C32e(0xd9b3bed4), C32e(0xca01468d), C32e(0x70ced967), C32e(0xdde44b72), + C32e(0x7933de94), C32e(0x672bd498), C32e(0x237be8b0), C32e(0xde114a85), + C32e(0xbd6d6bbb), C32e(0x7e912ac5), C32e(0x349ee54f), C32e(0x3ac116ed), + C32e(0x5417c586), C32e(0x622fd79a), C32e(0xffcc5566), C32e(0xa7229411), + C32e(0x4a0fcf8a), C32e(0x30c910e9), C32e(0x0a080604), C32e(0x98e781fe), + C32e(0x0b5bf0a0), C32e(0xccf04478), C32e(0xd54aba25), C32e(0x3e96e34b), + C32e(0x0e5ff3a2), C32e(0x19bafe5d), C32e(0x5b1bc080), C32e(0x850a8a05), + C32e(0xec7ead3f), C32e(0xdf42bc21), C32e(0xd8e04870), C32e(0x0cf904f1), + C32e(0x7ac6df63), C32e(0x58eec177), C32e(0x9f4575af), C32e(0xa5846342), + C32e(0x50403020), C32e(0x2ed11ae5), C32e(0x12e10efd), C32e(0xb7656dbf), + C32e(0xd4194c81), C32e(0x3c301418), C32e(0x5f4c3526), C32e(0x719d2fc3), + C32e(0x3867e1be), C32e(0xfd6aa235), C32e(0x4f0bcc88), C32e(0x4b5c392e), + C32e(0xf93d5793), C32e(0x0daaf255), C32e(0x9de382fc), C32e(0xc9f4477a), + C32e(0xef8bacc8), C32e(0x326fe7ba), C32e(0x7d642b32), C32e(0xa4d795e6), + C32e(0xfb9ba0c0), C32e(0xb3329819), C32e(0x6827d19e), C32e(0x815d7fa3), + C32e(0xaa886644), C32e(0x82a87e54), C32e(0xe676ab3b), 
C32e(0x9e16830b), + C32e(0x4503ca8c), C32e(0x7b9529c7), C32e(0x6ed6d36b), C32e(0x44503c28), + C32e(0x8b5579a7), C32e(0x3d63e2bc), C32e(0x272c1d16), C32e(0x9a4176ad), + C32e(0x4dad3bdb), C32e(0xfac85664), C32e(0xd2e84e74), C32e(0x22281e14), + C32e(0x763fdb92), C32e(0x1e180a0c), C32e(0xb4906c48), C32e(0x376be4b8), + C32e(0xe7255d9f), C32e(0xb2616ebd), C32e(0x2a86ef43), C32e(0xf193a6c4), + C32e(0xe372a839), C32e(0xf762a431), C32e(0x59bd37d3), C32e(0x86ff8bf2), + C32e(0x56b132d5), C32e(0xc50d438b), C32e(0xebdc596e), C32e(0xc2afb7da), + C32e(0x8f028c01), C32e(0xac7964b1), C32e(0x6d23d29c), C32e(0x3b92e049), + C32e(0xc7abb4d8), C32e(0x1543faac), C32e(0x09fd07f3), C32e(0x6f8525cf), + C32e(0xea8fafca), C32e(0x89f38ef4), C32e(0x208ee947), C32e(0x28201810), + C32e(0x64ded56f), C32e(0x83fb88f0), C32e(0xb1946f4a), C32e(0x96b8725c), + C32e(0x6c702438), C32e(0x08aef157), C32e(0x52e6c773), C32e(0xf3355197), + C32e(0x658d23cb), C32e(0x84597ca1), C32e(0xbfcb9ce8), C32e(0x637c213e), + C32e(0x7c37dd96), C32e(0x7fc2dc61), C32e(0x911a860d), C32e(0x941e850f), + C32e(0xabdb90e0), C32e(0xc6f8427c), C32e(0x57e2c471), C32e(0xe583aacc), + C32e(0x733bd890), C32e(0x0f0c0506), C32e(0x03f501f7), C32e(0x3638121c), + C32e(0xfe9fa3c2), C32e(0xe1d45f6a), C32e(0x1047f9ae), C32e(0x6bd2d069), + C32e(0xa82e9117), C32e(0xe8295899), C32e(0x6974273a), C32e(0xd04eb927), + C32e(0x48a938d9), C32e(0x35cd13eb), C32e(0xce56b32b), C32e(0x55443322), + C32e(0xd6bfbbd2), C32e(0x904970a9), C32e(0x800e8907), C32e(0xf266a733), + C32e(0xc15ab62d), C32e(0x6678223c), C32e(0xad2a9215), C32e(0x608920c9), + C32e(0xdb154987), C32e(0x1a4fffaa), C32e(0x88a07850), C32e(0x8e517aa5), + C32e(0x8a068f03), C32e(0x13b2f859), C32e(0x9b128009), C32e(0x3934171a), + C32e(0x75cada65), C32e(0x53b531d7), C32e(0x5113c684), C32e(0xd3bbb8d0), + C32e(0x5e1fc382), C32e(0xcb52b029), C32e(0x99b4775a), C32e(0x333c111e), + C32e(0x46f6cb7b), C32e(0x1f4bfca8), C32e(0x61dad66d), C32e(0x4e583a2c) +}; + +static const sph_u32 T1up[] = { + C32e(0xc6c632f4), C32e(0xf8f86f97), C32e(0xeeee5eb0), C32e(0xf6f67a8c), + C32e(0xffffe817), C32e(0xd6d60adc), C32e(0xdede16c8), C32e(0x91916dfc), + C32e(0x606090f0), C32e(0x02020705), C32e(0xcece2ee0), C32e(0x5656d187), + C32e(0xe7e7cc2b), C32e(0xb5b513a6), C32e(0x4d4d7c31), C32e(0xecec59b5), + C32e(0x8f8f40cf), C32e(0x1f1fa3bc), C32e(0x898949c0), C32e(0xfafa6892), + C32e(0xefefd03f), C32e(0xb2b29426), C32e(0x8e8ece40), C32e(0xfbfbe61d), + C32e(0x41416e2f), C32e(0xb3b31aa9), C32e(0x5f5f431c), C32e(0x45456025), + C32e(0x2323f9da), C32e(0x53535102), C32e(0xe4e445a1), C32e(0x9b9b76ed), + C32e(0x7575285d), C32e(0xe1e1c524), C32e(0x3d3dd4e9), C32e(0x4c4cf2be), + C32e(0x6c6c82ee), C32e(0x7e7ebdc3), C32e(0xf5f5f306), C32e(0x838352d1), + C32e(0x68688ce4), C32e(0x51515607), C32e(0xd1d18d5c), C32e(0xf9f9e118), + C32e(0xe2e24cae), C32e(0xabab3e95), C32e(0x626297f5), C32e(0x2a2a6b41), + C32e(0x08081c14), C32e(0x959563f6), C32e(0x4646e9af), C32e(0x9d9d7fe2), + C32e(0x30304878), C32e(0x3737cff8), C32e(0x0a0a1b11), C32e(0x2f2febc4), + C32e(0x0e0e151b), C32e(0x24247e5a), C32e(0x1b1badb6), C32e(0xdfdf9847), + C32e(0xcdcda76a), C32e(0x4e4ef5bb), C32e(0x7f7f334c), C32e(0xeaea50ba), + C32e(0x12123f2d), C32e(0x1d1da4b9), C32e(0x5858c49c), C32e(0x34344672), + C32e(0x36364177), C32e(0xdcdc11cd), C32e(0xb4b49d29), C32e(0x5b5b4d16), + C32e(0xa4a4a501), C32e(0x7676a1d7), C32e(0xb7b714a3), C32e(0x7d7d3449), + C32e(0x5252df8d), C32e(0xdddd9f42), C32e(0x5e5ecd93), C32e(0x1313b1a2), + C32e(0xa6a6a204), C32e(0xb9b901b8), C32e(0x00000000), C32e(0xc1c1b574), + C32e(0x4040e0a0), 
C32e(0xe3e3c221), C32e(0x79793a43), C32e(0xb6b69a2c), + C32e(0xd4d40dd9), C32e(0x8d8d47ca), C32e(0x67671770), C32e(0x7272afdd), + C32e(0x9494ed79), C32e(0x9898ff67), C32e(0xb0b09323), C32e(0x85855bde), + C32e(0xbbbb06bd), C32e(0xc5c5bb7e), C32e(0x4f4f7b34), C32e(0xededd73a), + C32e(0x8686d254), C32e(0x9a9af862), C32e(0x666699ff), C32e(0x1111b6a7), + C32e(0x8a8ac04a), C32e(0xe9e9d930), C32e(0x04040e0a), C32e(0xfefe6698), + C32e(0xa0a0ab0b), C32e(0x7878b4cc), C32e(0x2525f0d5), C32e(0x4b4b753e), + C32e(0xa2a2ac0e), C32e(0x5d5d4419), C32e(0x8080db5b), C32e(0x05058085), + C32e(0x3f3fd3ec), C32e(0x2121fedf), C32e(0x7070a8d8), C32e(0xf1f1fd0c), + C32e(0x6363197a), C32e(0x77772f58), C32e(0xafaf309f), C32e(0x4242e7a5), + C32e(0x20207050), C32e(0xe5e5cb2e), C32e(0xfdfdef12), C32e(0xbfbf08b7), + C32e(0x818155d4), C32e(0x1818243c), C32e(0x2626795f), C32e(0xc3c3b271), + C32e(0xbebe8638), C32e(0x3535c8fd), C32e(0x8888c74f), C32e(0x2e2e654b), + C32e(0x93936af9), C32e(0x5555580d), C32e(0xfcfc619d), C32e(0x7a7ab3c9), + C32e(0xc8c827ef), C32e(0xbaba8832), C32e(0x32324f7d), C32e(0xe6e642a4), + C32e(0xc0c03bfb), C32e(0x1919aab3), C32e(0x9e9ef668), C32e(0xa3a32281), + C32e(0x4444eeaa), C32e(0x5454d682), C32e(0x3b3bdde6), C32e(0x0b0b959e), + C32e(0x8c8cc945), C32e(0xc7c7bc7b), C32e(0x6b6b056e), C32e(0x28286c44), + C32e(0xa7a72c8b), C32e(0xbcbc813d), C32e(0x16163127), C32e(0xadad379a), + C32e(0xdbdb964d), C32e(0x64649efa), C32e(0x7474a6d2), C32e(0x14143622), + C32e(0x9292e476), C32e(0x0c0c121e), C32e(0x4848fcb4), C32e(0xb8b88f37), + C32e(0x9f9f78e7), C32e(0xbdbd0fb2), C32e(0x4343692a), C32e(0xc4c435f1), + C32e(0x3939dae3), C32e(0x3131c6f7), C32e(0xd3d38a59), C32e(0xf2f27486), + C32e(0xd5d58356), C32e(0x8b8b4ec5), C32e(0x6e6e85eb), C32e(0xdada18c2), + C32e(0x01018e8f), C32e(0xb1b11dac), C32e(0x9c9cf16d), C32e(0x4949723b), + C32e(0xd8d81fc7), C32e(0xacacb915), C32e(0xf3f3fa09), C32e(0xcfcfa06f), + C32e(0xcaca20ea), C32e(0xf4f47d89), C32e(0x47476720), C32e(0x10103828), + C32e(0x6f6f0b64), C32e(0xf0f07383), C32e(0x4a4afbb1), C32e(0x5c5cca96), + C32e(0x3838546c), C32e(0x57575f08), C32e(0x73732152), C32e(0x979764f3), + C32e(0xcbcbae65), C32e(0xa1a12584), C32e(0xe8e857bf), C32e(0x3e3e5d63), + C32e(0x9696ea7c), C32e(0x61611e7f), C32e(0x0d0d9c91), C32e(0x0f0f9b94), + C32e(0xe0e04bab), C32e(0x7c7cbac6), C32e(0x71712657), C32e(0xcccc29e5), + C32e(0x9090e373), C32e(0x0606090f), C32e(0xf7f7f403), C32e(0x1c1c2a36), + C32e(0xc2c23cfe), C32e(0x6a6a8be1), C32e(0xaeaebe10), C32e(0x6969026b), + C32e(0x1717bfa8), C32e(0x999971e8), C32e(0x3a3a5369), C32e(0x2727f7d0), + C32e(0xd9d99148), C32e(0xebebde35), C32e(0x2b2be5ce), C32e(0x22227755), + C32e(0xd2d204d6), C32e(0xa9a93990), C32e(0x07078780), C32e(0x3333c1f2), + C32e(0x2d2decc1), C32e(0x3c3c5a66), C32e(0x1515b8ad), C32e(0xc9c9a960), + C32e(0x87875cdb), C32e(0xaaaab01a), C32e(0x5050d888), C32e(0xa5a52b8e), + C32e(0x0303898a), C32e(0x59594a13), C32e(0x0909929b), C32e(0x1a1a2339), + C32e(0x65651075), C32e(0xd7d78453), C32e(0x8484d551), C32e(0xd0d003d3), + C32e(0x8282dc5e), C32e(0x2929e2cb), C32e(0x5a5ac399), C32e(0x1e1e2d33), + C32e(0x7b7b3d46), C32e(0xa8a8b71f), C32e(0x6d6d0c61), C32e(0x2c2c624e) +}; + +static const sph_u32 T1dn[] = { + C32e(0xa5f497a5), C32e(0x8497eb84), C32e(0x99b0c799), C32e(0x8d8cf78d), + C32e(0x0d17e50d), C32e(0xbddcb7bd), C32e(0xb1c8a7b1), C32e(0x54fc3954), + C32e(0x50f0c050), C32e(0x03050403), C32e(0xa9e087a9), C32e(0x7d87ac7d), + C32e(0x192bd519), C32e(0x62a67162), C32e(0xe6319ae6), C32e(0x9ab5c39a), + C32e(0x45cf0545), C32e(0x9dbc3e9d), C32e(0x40c00940), 
C32e(0x8792ef87), + C32e(0x153fc515), C32e(0xeb267feb), C32e(0xc94007c9), C32e(0x0b1ded0b), + C32e(0xec2f82ec), C32e(0x67a97d67), C32e(0xfd1cbefd), C32e(0xea258aea), + C32e(0xbfda46bf), C32e(0xf702a6f7), C32e(0x96a1d396), C32e(0x5bed2d5b), + C32e(0xc25deac2), C32e(0x1c24d91c), C32e(0xaee97aae), C32e(0x6abe986a), + C32e(0x5aeed85a), C32e(0x41c3fc41), C32e(0x0206f102), C32e(0x4fd11d4f), + C32e(0x5ce4d05c), C32e(0xf407a2f4), C32e(0x345cb934), C32e(0x0818e908), + C32e(0x93aedf93), C32e(0x73954d73), C32e(0x53f5c453), C32e(0x3f41543f), + C32e(0x0c14100c), C32e(0x52f63152), C32e(0x65af8c65), C32e(0x5ee2215e), + C32e(0x28786028), C32e(0xa1f86ea1), C32e(0x0f11140f), C32e(0xb5c45eb5), + C32e(0x091b1c09), C32e(0x365a4836), C32e(0x9bb6369b), C32e(0x3d47a53d), + C32e(0x266a8126), C32e(0x69bb9c69), C32e(0xcd4cfecd), C32e(0x9fbacf9f), + C32e(0x1b2d241b), C32e(0x9eb93a9e), C32e(0x749cb074), C32e(0x2e72682e), + C32e(0x2d776c2d), C32e(0xb2cda3b2), C32e(0xee2973ee), C32e(0xfb16b6fb), + C32e(0xf60153f6), C32e(0x4dd7ec4d), C32e(0x61a37561), C32e(0xce49face), + C32e(0x7b8da47b), C32e(0x3e42a13e), C32e(0x7193bc71), C32e(0x97a22697), + C32e(0xf50457f5), C32e(0x68b86968), C32e(0x00000000), C32e(0x2c74992c), + C32e(0x60a08060), C32e(0x1f21dd1f), C32e(0xc843f2c8), C32e(0xed2c77ed), + C32e(0xbed9b3be), C32e(0x46ca0146), C32e(0xd970ced9), C32e(0x4bdde44b), + C32e(0xde7933de), C32e(0xd4672bd4), C32e(0xe8237be8), C32e(0x4ade114a), + C32e(0x6bbd6d6b), C32e(0x2a7e912a), C32e(0xe5349ee5), C32e(0x163ac116), + C32e(0xc55417c5), C32e(0xd7622fd7), C32e(0x55ffcc55), C32e(0x94a72294), + C32e(0xcf4a0fcf), C32e(0x1030c910), C32e(0x060a0806), C32e(0x8198e781), + C32e(0xf00b5bf0), C32e(0x44ccf044), C32e(0xbad54aba), C32e(0xe33e96e3), + C32e(0xf30e5ff3), C32e(0xfe19bafe), C32e(0xc05b1bc0), C32e(0x8a850a8a), + C32e(0xadec7ead), C32e(0xbcdf42bc), C32e(0x48d8e048), C32e(0x040cf904), + C32e(0xdf7ac6df), C32e(0xc158eec1), C32e(0x759f4575), C32e(0x63a58463), + C32e(0x30504030), C32e(0x1a2ed11a), C32e(0x0e12e10e), C32e(0x6db7656d), + C32e(0x4cd4194c), C32e(0x143c3014), C32e(0x355f4c35), C32e(0x2f719d2f), + C32e(0xe13867e1), C32e(0xa2fd6aa2), C32e(0xcc4f0bcc), C32e(0x394b5c39), + C32e(0x57f93d57), C32e(0xf20daaf2), C32e(0x829de382), C32e(0x47c9f447), + C32e(0xacef8bac), C32e(0xe7326fe7), C32e(0x2b7d642b), C32e(0x95a4d795), + C32e(0xa0fb9ba0), C32e(0x98b33298), C32e(0xd16827d1), C32e(0x7f815d7f), + C32e(0x66aa8866), C32e(0x7e82a87e), C32e(0xabe676ab), C32e(0x839e1683), + C32e(0xca4503ca), C32e(0x297b9529), C32e(0xd36ed6d3), C32e(0x3c44503c), + C32e(0x798b5579), C32e(0xe23d63e2), C32e(0x1d272c1d), C32e(0x769a4176), + C32e(0x3b4dad3b), C32e(0x56fac856), C32e(0x4ed2e84e), C32e(0x1e22281e), + C32e(0xdb763fdb), C32e(0x0a1e180a), C32e(0x6cb4906c), C32e(0xe4376be4), + C32e(0x5de7255d), C32e(0x6eb2616e), C32e(0xef2a86ef), C32e(0xa6f193a6), + C32e(0xa8e372a8), C32e(0xa4f762a4), C32e(0x3759bd37), C32e(0x8b86ff8b), + C32e(0x3256b132), C32e(0x43c50d43), C32e(0x59ebdc59), C32e(0xb7c2afb7), + C32e(0x8c8f028c), C32e(0x64ac7964), C32e(0xd26d23d2), C32e(0xe03b92e0), + C32e(0xb4c7abb4), C32e(0xfa1543fa), C32e(0x0709fd07), C32e(0x256f8525), + C32e(0xafea8faf), C32e(0x8e89f38e), C32e(0xe9208ee9), C32e(0x18282018), + C32e(0xd564ded5), C32e(0x8883fb88), C32e(0x6fb1946f), C32e(0x7296b872), + C32e(0x246c7024), C32e(0xf108aef1), C32e(0xc752e6c7), C32e(0x51f33551), + C32e(0x23658d23), C32e(0x7c84597c), C32e(0x9cbfcb9c), C32e(0x21637c21), + C32e(0xdd7c37dd), C32e(0xdc7fc2dc), C32e(0x86911a86), C32e(0x85941e85), + C32e(0x90abdb90), C32e(0x42c6f842), C32e(0xc457e2c4), 
C32e(0xaae583aa), + C32e(0xd8733bd8), C32e(0x050f0c05), C32e(0x0103f501), C32e(0x12363812), + C32e(0xa3fe9fa3), C32e(0x5fe1d45f), C32e(0xf91047f9), C32e(0xd06bd2d0), + C32e(0x91a82e91), C32e(0x58e82958), C32e(0x27697427), C32e(0xb9d04eb9), + C32e(0x3848a938), C32e(0x1335cd13), C32e(0xb3ce56b3), C32e(0x33554433), + C32e(0xbbd6bfbb), C32e(0x70904970), C32e(0x89800e89), C32e(0xa7f266a7), + C32e(0xb6c15ab6), C32e(0x22667822), C32e(0x92ad2a92), C32e(0x20608920), + C32e(0x49db1549), C32e(0xff1a4fff), C32e(0x7888a078), C32e(0x7a8e517a), + C32e(0x8f8a068f), C32e(0xf813b2f8), C32e(0x809b1280), C32e(0x17393417), + C32e(0xda75cada), C32e(0x3153b531), C32e(0xc65113c6), C32e(0xb8d3bbb8), + C32e(0xc35e1fc3), C32e(0xb0cb52b0), C32e(0x7799b477), C32e(0x11333c11), + C32e(0xcb46f6cb), C32e(0xfc1f4bfc), C32e(0xd661dad6), C32e(0x3a4e583a) +}; + +static const sph_u32 T2up[] = { + C32e(0xa5c6c632), C32e(0x84f8f86f), C32e(0x99eeee5e), C32e(0x8df6f67a), + C32e(0x0dffffe8), C32e(0xbdd6d60a), C32e(0xb1dede16), C32e(0x5491916d), + C32e(0x50606090), C32e(0x03020207), C32e(0xa9cece2e), C32e(0x7d5656d1), + C32e(0x19e7e7cc), C32e(0x62b5b513), C32e(0xe64d4d7c), C32e(0x9aecec59), + C32e(0x458f8f40), C32e(0x9d1f1fa3), C32e(0x40898949), C32e(0x87fafa68), + C32e(0x15efefd0), C32e(0xebb2b294), C32e(0xc98e8ece), C32e(0x0bfbfbe6), + C32e(0xec41416e), C32e(0x67b3b31a), C32e(0xfd5f5f43), C32e(0xea454560), + C32e(0xbf2323f9), C32e(0xf7535351), C32e(0x96e4e445), C32e(0x5b9b9b76), + C32e(0xc2757528), C32e(0x1ce1e1c5), C32e(0xae3d3dd4), C32e(0x6a4c4cf2), + C32e(0x5a6c6c82), C32e(0x417e7ebd), C32e(0x02f5f5f3), C32e(0x4f838352), + C32e(0x5c68688c), C32e(0xf4515156), C32e(0x34d1d18d), C32e(0x08f9f9e1), + C32e(0x93e2e24c), C32e(0x73abab3e), C32e(0x53626297), C32e(0x3f2a2a6b), + C32e(0x0c08081c), C32e(0x52959563), C32e(0x654646e9), C32e(0x5e9d9d7f), + C32e(0x28303048), C32e(0xa13737cf), C32e(0x0f0a0a1b), C32e(0xb52f2feb), + C32e(0x090e0e15), C32e(0x3624247e), C32e(0x9b1b1bad), C32e(0x3ddfdf98), + C32e(0x26cdcda7), C32e(0x694e4ef5), C32e(0xcd7f7f33), C32e(0x9feaea50), + C32e(0x1b12123f), C32e(0x9e1d1da4), C32e(0x745858c4), C32e(0x2e343446), + C32e(0x2d363641), C32e(0xb2dcdc11), C32e(0xeeb4b49d), C32e(0xfb5b5b4d), + C32e(0xf6a4a4a5), C32e(0x4d7676a1), C32e(0x61b7b714), C32e(0xce7d7d34), + C32e(0x7b5252df), C32e(0x3edddd9f), C32e(0x715e5ecd), C32e(0x971313b1), + C32e(0xf5a6a6a2), C32e(0x68b9b901), C32e(0x00000000), C32e(0x2cc1c1b5), + C32e(0x604040e0), C32e(0x1fe3e3c2), C32e(0xc879793a), C32e(0xedb6b69a), + C32e(0xbed4d40d), C32e(0x468d8d47), C32e(0xd9676717), C32e(0x4b7272af), + C32e(0xde9494ed), C32e(0xd49898ff), C32e(0xe8b0b093), C32e(0x4a85855b), + C32e(0x6bbbbb06), C32e(0x2ac5c5bb), C32e(0xe54f4f7b), C32e(0x16ededd7), + C32e(0xc58686d2), C32e(0xd79a9af8), C32e(0x55666699), C32e(0x941111b6), + C32e(0xcf8a8ac0), C32e(0x10e9e9d9), C32e(0x0604040e), C32e(0x81fefe66), + C32e(0xf0a0a0ab), C32e(0x447878b4), C32e(0xba2525f0), C32e(0xe34b4b75), + C32e(0xf3a2a2ac), C32e(0xfe5d5d44), C32e(0xc08080db), C32e(0x8a050580), + C32e(0xad3f3fd3), C32e(0xbc2121fe), C32e(0x487070a8), C32e(0x04f1f1fd), + C32e(0xdf636319), C32e(0xc177772f), C32e(0x75afaf30), C32e(0x634242e7), + C32e(0x30202070), C32e(0x1ae5e5cb), C32e(0x0efdfdef), C32e(0x6dbfbf08), + C32e(0x4c818155), C32e(0x14181824), C32e(0x35262679), C32e(0x2fc3c3b2), + C32e(0xe1bebe86), C32e(0xa23535c8), C32e(0xcc8888c7), C32e(0x392e2e65), + C32e(0x5793936a), C32e(0xf2555558), C32e(0x82fcfc61), C32e(0x477a7ab3), + C32e(0xacc8c827), C32e(0xe7baba88), C32e(0x2b32324f), C32e(0x95e6e642), + C32e(0xa0c0c03b), 
C32e(0x981919aa), C32e(0xd19e9ef6), C32e(0x7fa3a322), + C32e(0x664444ee), C32e(0x7e5454d6), C32e(0xab3b3bdd), C32e(0x830b0b95), + C32e(0xca8c8cc9), C32e(0x29c7c7bc), C32e(0xd36b6b05), C32e(0x3c28286c), + C32e(0x79a7a72c), C32e(0xe2bcbc81), C32e(0x1d161631), C32e(0x76adad37), + C32e(0x3bdbdb96), C32e(0x5664649e), C32e(0x4e7474a6), C32e(0x1e141436), + C32e(0xdb9292e4), C32e(0x0a0c0c12), C32e(0x6c4848fc), C32e(0xe4b8b88f), + C32e(0x5d9f9f78), C32e(0x6ebdbd0f), C32e(0xef434369), C32e(0xa6c4c435), + C32e(0xa83939da), C32e(0xa43131c6), C32e(0x37d3d38a), C32e(0x8bf2f274), + C32e(0x32d5d583), C32e(0x438b8b4e), C32e(0x596e6e85), C32e(0xb7dada18), + C32e(0x8c01018e), C32e(0x64b1b11d), C32e(0xd29c9cf1), C32e(0xe0494972), + C32e(0xb4d8d81f), C32e(0xfaacacb9), C32e(0x07f3f3fa), C32e(0x25cfcfa0), + C32e(0xafcaca20), C32e(0x8ef4f47d), C32e(0xe9474767), C32e(0x18101038), + C32e(0xd56f6f0b), C32e(0x88f0f073), C32e(0x6f4a4afb), C32e(0x725c5cca), + C32e(0x24383854), C32e(0xf157575f), C32e(0xc7737321), C32e(0x51979764), + C32e(0x23cbcbae), C32e(0x7ca1a125), C32e(0x9ce8e857), C32e(0x213e3e5d), + C32e(0xdd9696ea), C32e(0xdc61611e), C32e(0x860d0d9c), C32e(0x850f0f9b), + C32e(0x90e0e04b), C32e(0x427c7cba), C32e(0xc4717126), C32e(0xaacccc29), + C32e(0xd89090e3), C32e(0x05060609), C32e(0x01f7f7f4), C32e(0x121c1c2a), + C32e(0xa3c2c23c), C32e(0x5f6a6a8b), C32e(0xf9aeaebe), C32e(0xd0696902), + C32e(0x911717bf), C32e(0x58999971), C32e(0x273a3a53), C32e(0xb92727f7), + C32e(0x38d9d991), C32e(0x13ebebde), C32e(0xb32b2be5), C32e(0x33222277), + C32e(0xbbd2d204), C32e(0x70a9a939), C32e(0x89070787), C32e(0xa73333c1), + C32e(0xb62d2dec), C32e(0x223c3c5a), C32e(0x921515b8), C32e(0x20c9c9a9), + C32e(0x4987875c), C32e(0xffaaaab0), C32e(0x785050d8), C32e(0x7aa5a52b), + C32e(0x8f030389), C32e(0xf859594a), C32e(0x80090992), C32e(0x171a1a23), + C32e(0xda656510), C32e(0x31d7d784), C32e(0xc68484d5), C32e(0xb8d0d003), + C32e(0xc38282dc), C32e(0xb02929e2), C32e(0x775a5ac3), C32e(0x111e1e2d), + C32e(0xcb7b7b3d), C32e(0xfca8a8b7), C32e(0xd66d6d0c), C32e(0x3a2c2c62) +}; + +static const sph_u32 T2dn[] = { + C32e(0xf4a5f497), C32e(0x978497eb), C32e(0xb099b0c7), C32e(0x8c8d8cf7), + C32e(0x170d17e5), C32e(0xdcbddcb7), C32e(0xc8b1c8a7), C32e(0xfc54fc39), + C32e(0xf050f0c0), C32e(0x05030504), C32e(0xe0a9e087), C32e(0x877d87ac), + C32e(0x2b192bd5), C32e(0xa662a671), C32e(0x31e6319a), C32e(0xb59ab5c3), + C32e(0xcf45cf05), C32e(0xbc9dbc3e), C32e(0xc040c009), C32e(0x928792ef), + C32e(0x3f153fc5), C32e(0x26eb267f), C32e(0x40c94007), C32e(0x1d0b1ded), + C32e(0x2fec2f82), C32e(0xa967a97d), C32e(0x1cfd1cbe), C32e(0x25ea258a), + C32e(0xdabfda46), C32e(0x02f702a6), C32e(0xa196a1d3), C32e(0xed5bed2d), + C32e(0x5dc25dea), C32e(0x241c24d9), C32e(0xe9aee97a), C32e(0xbe6abe98), + C32e(0xee5aeed8), C32e(0xc341c3fc), C32e(0x060206f1), C32e(0xd14fd11d), + C32e(0xe45ce4d0), C32e(0x07f407a2), C32e(0x5c345cb9), C32e(0x180818e9), + C32e(0xae93aedf), C32e(0x9573954d), C32e(0xf553f5c4), C32e(0x413f4154), + C32e(0x140c1410), C32e(0xf652f631), C32e(0xaf65af8c), C32e(0xe25ee221), + C32e(0x78287860), C32e(0xf8a1f86e), C32e(0x110f1114), C32e(0xc4b5c45e), + C32e(0x1b091b1c), C32e(0x5a365a48), C32e(0xb69bb636), C32e(0x473d47a5), + C32e(0x6a266a81), C32e(0xbb69bb9c), C32e(0x4ccd4cfe), C32e(0xba9fbacf), + C32e(0x2d1b2d24), C32e(0xb99eb93a), C32e(0x9c749cb0), C32e(0x722e7268), + C32e(0x772d776c), C32e(0xcdb2cda3), C32e(0x29ee2973), C32e(0x16fb16b6), + C32e(0x01f60153), C32e(0xd74dd7ec), C32e(0xa361a375), C32e(0x49ce49fa), + C32e(0x8d7b8da4), C32e(0x423e42a1), C32e(0x937193bc), 
C32e(0xa297a226), + C32e(0x04f50457), C32e(0xb868b869), C32e(0x00000000), C32e(0x742c7499), + C32e(0xa060a080), C32e(0x211f21dd), C32e(0x43c843f2), C32e(0x2ced2c77), + C32e(0xd9bed9b3), C32e(0xca46ca01), C32e(0x70d970ce), C32e(0xdd4bdde4), + C32e(0x79de7933), C32e(0x67d4672b), C32e(0x23e8237b), C32e(0xde4ade11), + C32e(0xbd6bbd6d), C32e(0x7e2a7e91), C32e(0x34e5349e), C32e(0x3a163ac1), + C32e(0x54c55417), C32e(0x62d7622f), C32e(0xff55ffcc), C32e(0xa794a722), + C32e(0x4acf4a0f), C32e(0x301030c9), C32e(0x0a060a08), C32e(0x988198e7), + C32e(0x0bf00b5b), C32e(0xcc44ccf0), C32e(0xd5bad54a), C32e(0x3ee33e96), + C32e(0x0ef30e5f), C32e(0x19fe19ba), C32e(0x5bc05b1b), C32e(0x858a850a), + C32e(0xecadec7e), C32e(0xdfbcdf42), C32e(0xd848d8e0), C32e(0x0c040cf9), + C32e(0x7adf7ac6), C32e(0x58c158ee), C32e(0x9f759f45), C32e(0xa563a584), + C32e(0x50305040), C32e(0x2e1a2ed1), C32e(0x120e12e1), C32e(0xb76db765), + C32e(0xd44cd419), C32e(0x3c143c30), C32e(0x5f355f4c), C32e(0x712f719d), + C32e(0x38e13867), C32e(0xfda2fd6a), C32e(0x4fcc4f0b), C32e(0x4b394b5c), + C32e(0xf957f93d), C32e(0x0df20daa), C32e(0x9d829de3), C32e(0xc947c9f4), + C32e(0xefacef8b), C32e(0x32e7326f), C32e(0x7d2b7d64), C32e(0xa495a4d7), + C32e(0xfba0fb9b), C32e(0xb398b332), C32e(0x68d16827), C32e(0x817f815d), + C32e(0xaa66aa88), C32e(0x827e82a8), C32e(0xe6abe676), C32e(0x9e839e16), + C32e(0x45ca4503), C32e(0x7b297b95), C32e(0x6ed36ed6), C32e(0x443c4450), + C32e(0x8b798b55), C32e(0x3de23d63), C32e(0x271d272c), C32e(0x9a769a41), + C32e(0x4d3b4dad), C32e(0xfa56fac8), C32e(0xd24ed2e8), C32e(0x221e2228), + C32e(0x76db763f), C32e(0x1e0a1e18), C32e(0xb46cb490), C32e(0x37e4376b), + C32e(0xe75de725), C32e(0xb26eb261), C32e(0x2aef2a86), C32e(0xf1a6f193), + C32e(0xe3a8e372), C32e(0xf7a4f762), C32e(0x593759bd), C32e(0x868b86ff), + C32e(0x563256b1), C32e(0xc543c50d), C32e(0xeb59ebdc), C32e(0xc2b7c2af), + C32e(0x8f8c8f02), C32e(0xac64ac79), C32e(0x6dd26d23), C32e(0x3be03b92), + C32e(0xc7b4c7ab), C32e(0x15fa1543), C32e(0x090709fd), C32e(0x6f256f85), + C32e(0xeaafea8f), C32e(0x898e89f3), C32e(0x20e9208e), C32e(0x28182820), + C32e(0x64d564de), C32e(0x838883fb), C32e(0xb16fb194), C32e(0x967296b8), + C32e(0x6c246c70), C32e(0x08f108ae), C32e(0x52c752e6), C32e(0xf351f335), + C32e(0x6523658d), C32e(0x847c8459), C32e(0xbf9cbfcb), C32e(0x6321637c), + C32e(0x7cdd7c37), C32e(0x7fdc7fc2), C32e(0x9186911a), C32e(0x9485941e), + C32e(0xab90abdb), C32e(0xc642c6f8), C32e(0x57c457e2), C32e(0xe5aae583), + C32e(0x73d8733b), C32e(0x0f050f0c), C32e(0x030103f5), C32e(0x36123638), + C32e(0xfea3fe9f), C32e(0xe15fe1d4), C32e(0x10f91047), C32e(0x6bd06bd2), + C32e(0xa891a82e), C32e(0xe858e829), C32e(0x69276974), C32e(0xd0b9d04e), + C32e(0x483848a9), C32e(0x351335cd), C32e(0xceb3ce56), C32e(0x55335544), + C32e(0xd6bbd6bf), C32e(0x90709049), C32e(0x8089800e), C32e(0xf2a7f266), + C32e(0xc1b6c15a), C32e(0x66226678), C32e(0xad92ad2a), C32e(0x60206089), + C32e(0xdb49db15), C32e(0x1aff1a4f), C32e(0x887888a0), C32e(0x8e7a8e51), + C32e(0x8a8f8a06), C32e(0x13f813b2), C32e(0x9b809b12), C32e(0x39173934), + C32e(0x75da75ca), C32e(0x533153b5), C32e(0x51c65113), C32e(0xd3b8d3bb), + C32e(0x5ec35e1f), C32e(0xcbb0cb52), C32e(0x997799b4), C32e(0x3311333c), + C32e(0x46cb46f6), C32e(0x1ffc1f4b), C32e(0x61d661da), C32e(0x4e3a4e58) +}; + +static const sph_u32 T3up[] = { + C32e(0x97a5c6c6), C32e(0xeb84f8f8), C32e(0xc799eeee), C32e(0xf78df6f6), + C32e(0xe50dffff), C32e(0xb7bdd6d6), C32e(0xa7b1dede), C32e(0x39549191), + C32e(0xc0506060), C32e(0x04030202), C32e(0x87a9cece), C32e(0xac7d5656), + C32e(0xd519e7e7), 
C32e(0x7162b5b5), C32e(0x9ae64d4d), C32e(0xc39aecec), + C32e(0x05458f8f), C32e(0x3e9d1f1f), C32e(0x09408989), C32e(0xef87fafa), + C32e(0xc515efef), C32e(0x7febb2b2), C32e(0x07c98e8e), C32e(0xed0bfbfb), + C32e(0x82ec4141), C32e(0x7d67b3b3), C32e(0xbefd5f5f), C32e(0x8aea4545), + C32e(0x46bf2323), C32e(0xa6f75353), C32e(0xd396e4e4), C32e(0x2d5b9b9b), + C32e(0xeac27575), C32e(0xd91ce1e1), C32e(0x7aae3d3d), C32e(0x986a4c4c), + C32e(0xd85a6c6c), C32e(0xfc417e7e), C32e(0xf102f5f5), C32e(0x1d4f8383), + C32e(0xd05c6868), C32e(0xa2f45151), C32e(0xb934d1d1), C32e(0xe908f9f9), + C32e(0xdf93e2e2), C32e(0x4d73abab), C32e(0xc4536262), C32e(0x543f2a2a), + C32e(0x100c0808), C32e(0x31529595), C32e(0x8c654646), C32e(0x215e9d9d), + C32e(0x60283030), C32e(0x6ea13737), C32e(0x140f0a0a), C32e(0x5eb52f2f), + C32e(0x1c090e0e), C32e(0x48362424), C32e(0x369b1b1b), C32e(0xa53ddfdf), + C32e(0x8126cdcd), C32e(0x9c694e4e), C32e(0xfecd7f7f), C32e(0xcf9feaea), + C32e(0x241b1212), C32e(0x3a9e1d1d), C32e(0xb0745858), C32e(0x682e3434), + C32e(0x6c2d3636), C32e(0xa3b2dcdc), C32e(0x73eeb4b4), C32e(0xb6fb5b5b), + C32e(0x53f6a4a4), C32e(0xec4d7676), C32e(0x7561b7b7), C32e(0xface7d7d), + C32e(0xa47b5252), C32e(0xa13edddd), C32e(0xbc715e5e), C32e(0x26971313), + C32e(0x57f5a6a6), C32e(0x6968b9b9), C32e(0x00000000), C32e(0x992cc1c1), + C32e(0x80604040), C32e(0xdd1fe3e3), C32e(0xf2c87979), C32e(0x77edb6b6), + C32e(0xb3bed4d4), C32e(0x01468d8d), C32e(0xced96767), C32e(0xe44b7272), + C32e(0x33de9494), C32e(0x2bd49898), C32e(0x7be8b0b0), C32e(0x114a8585), + C32e(0x6d6bbbbb), C32e(0x912ac5c5), C32e(0x9ee54f4f), C32e(0xc116eded), + C32e(0x17c58686), C32e(0x2fd79a9a), C32e(0xcc556666), C32e(0x22941111), + C32e(0x0fcf8a8a), C32e(0xc910e9e9), C32e(0x08060404), C32e(0xe781fefe), + C32e(0x5bf0a0a0), C32e(0xf0447878), C32e(0x4aba2525), C32e(0x96e34b4b), + C32e(0x5ff3a2a2), C32e(0xbafe5d5d), C32e(0x1bc08080), C32e(0x0a8a0505), + C32e(0x7ead3f3f), C32e(0x42bc2121), C32e(0xe0487070), C32e(0xf904f1f1), + C32e(0xc6df6363), C32e(0xeec17777), C32e(0x4575afaf), C32e(0x84634242), + C32e(0x40302020), C32e(0xd11ae5e5), C32e(0xe10efdfd), C32e(0x656dbfbf), + C32e(0x194c8181), C32e(0x30141818), C32e(0x4c352626), C32e(0x9d2fc3c3), + C32e(0x67e1bebe), C32e(0x6aa23535), C32e(0x0bcc8888), C32e(0x5c392e2e), + C32e(0x3d579393), C32e(0xaaf25555), C32e(0xe382fcfc), C32e(0xf4477a7a), + C32e(0x8bacc8c8), C32e(0x6fe7baba), C32e(0x642b3232), C32e(0xd795e6e6), + C32e(0x9ba0c0c0), C32e(0x32981919), C32e(0x27d19e9e), C32e(0x5d7fa3a3), + C32e(0x88664444), C32e(0xa87e5454), C32e(0x76ab3b3b), C32e(0x16830b0b), + C32e(0x03ca8c8c), C32e(0x9529c7c7), C32e(0xd6d36b6b), C32e(0x503c2828), + C32e(0x5579a7a7), C32e(0x63e2bcbc), C32e(0x2c1d1616), C32e(0x4176adad), + C32e(0xad3bdbdb), C32e(0xc8566464), C32e(0xe84e7474), C32e(0x281e1414), + C32e(0x3fdb9292), C32e(0x180a0c0c), C32e(0x906c4848), C32e(0x6be4b8b8), + C32e(0x255d9f9f), C32e(0x616ebdbd), C32e(0x86ef4343), C32e(0x93a6c4c4), + C32e(0x72a83939), C32e(0x62a43131), C32e(0xbd37d3d3), C32e(0xff8bf2f2), + C32e(0xb132d5d5), C32e(0x0d438b8b), C32e(0xdc596e6e), C32e(0xafb7dada), + C32e(0x028c0101), C32e(0x7964b1b1), C32e(0x23d29c9c), C32e(0x92e04949), + C32e(0xabb4d8d8), C32e(0x43faacac), C32e(0xfd07f3f3), C32e(0x8525cfcf), + C32e(0x8fafcaca), C32e(0xf38ef4f4), C32e(0x8ee94747), C32e(0x20181010), + C32e(0xded56f6f), C32e(0xfb88f0f0), C32e(0x946f4a4a), C32e(0xb8725c5c), + C32e(0x70243838), C32e(0xaef15757), C32e(0xe6c77373), C32e(0x35519797), + C32e(0x8d23cbcb), C32e(0x597ca1a1), C32e(0xcb9ce8e8), C32e(0x7c213e3e), + C32e(0x37dd9696), 
C32e(0xc2dc6161), C32e(0x1a860d0d), C32e(0x1e850f0f), + C32e(0xdb90e0e0), C32e(0xf8427c7c), C32e(0xe2c47171), C32e(0x83aacccc), + C32e(0x3bd89090), C32e(0x0c050606), C32e(0xf501f7f7), C32e(0x38121c1c), + C32e(0x9fa3c2c2), C32e(0xd45f6a6a), C32e(0x47f9aeae), C32e(0xd2d06969), + C32e(0x2e911717), C32e(0x29589999), C32e(0x74273a3a), C32e(0x4eb92727), + C32e(0xa938d9d9), C32e(0xcd13ebeb), C32e(0x56b32b2b), C32e(0x44332222), + C32e(0xbfbbd2d2), C32e(0x4970a9a9), C32e(0x0e890707), C32e(0x66a73333), + C32e(0x5ab62d2d), C32e(0x78223c3c), C32e(0x2a921515), C32e(0x8920c9c9), + C32e(0x15498787), C32e(0x4fffaaaa), C32e(0xa0785050), C32e(0x517aa5a5), + C32e(0x068f0303), C32e(0xb2f85959), C32e(0x12800909), C32e(0x34171a1a), + C32e(0xcada6565), C32e(0xb531d7d7), C32e(0x13c68484), C32e(0xbbb8d0d0), + C32e(0x1fc38282), C32e(0x52b02929), C32e(0xb4775a5a), C32e(0x3c111e1e), + C32e(0xf6cb7b7b), C32e(0x4bfca8a8), C32e(0xdad66d6d), C32e(0x583a2c2c) +}; + +static const sph_u32 T3dn[] = { + C32e(0x32f4a5f4), C32e(0x6f978497), C32e(0x5eb099b0), C32e(0x7a8c8d8c), + C32e(0xe8170d17), C32e(0x0adcbddc), C32e(0x16c8b1c8), C32e(0x6dfc54fc), + C32e(0x90f050f0), C32e(0x07050305), C32e(0x2ee0a9e0), C32e(0xd1877d87), + C32e(0xcc2b192b), C32e(0x13a662a6), C32e(0x7c31e631), C32e(0x59b59ab5), + C32e(0x40cf45cf), C32e(0xa3bc9dbc), C32e(0x49c040c0), C32e(0x68928792), + C32e(0xd03f153f), C32e(0x9426eb26), C32e(0xce40c940), C32e(0xe61d0b1d), + C32e(0x6e2fec2f), C32e(0x1aa967a9), C32e(0x431cfd1c), C32e(0x6025ea25), + C32e(0xf9dabfda), C32e(0x5102f702), C32e(0x45a196a1), C32e(0x76ed5bed), + C32e(0x285dc25d), C32e(0xc5241c24), C32e(0xd4e9aee9), C32e(0xf2be6abe), + C32e(0x82ee5aee), C32e(0xbdc341c3), C32e(0xf3060206), C32e(0x52d14fd1), + C32e(0x8ce45ce4), C32e(0x5607f407), C32e(0x8d5c345c), C32e(0xe1180818), + C32e(0x4cae93ae), C32e(0x3e957395), C32e(0x97f553f5), C32e(0x6b413f41), + C32e(0x1c140c14), C32e(0x63f652f6), C32e(0xe9af65af), C32e(0x7fe25ee2), + C32e(0x48782878), C32e(0xcff8a1f8), C32e(0x1b110f11), C32e(0xebc4b5c4), + C32e(0x151b091b), C32e(0x7e5a365a), C32e(0xadb69bb6), C32e(0x98473d47), + C32e(0xa76a266a), C32e(0xf5bb69bb), C32e(0x334ccd4c), C32e(0x50ba9fba), + C32e(0x3f2d1b2d), C32e(0xa4b99eb9), C32e(0xc49c749c), C32e(0x46722e72), + C32e(0x41772d77), C32e(0x11cdb2cd), C32e(0x9d29ee29), C32e(0x4d16fb16), + C32e(0xa501f601), C32e(0xa1d74dd7), C32e(0x14a361a3), C32e(0x3449ce49), + C32e(0xdf8d7b8d), C32e(0x9f423e42), C32e(0xcd937193), C32e(0xb1a297a2), + C32e(0xa204f504), C32e(0x01b868b8), C32e(0x00000000), C32e(0xb5742c74), + C32e(0xe0a060a0), C32e(0xc2211f21), C32e(0x3a43c843), C32e(0x9a2ced2c), + C32e(0x0dd9bed9), C32e(0x47ca46ca), C32e(0x1770d970), C32e(0xafdd4bdd), + C32e(0xed79de79), C32e(0xff67d467), C32e(0x9323e823), C32e(0x5bde4ade), + C32e(0x06bd6bbd), C32e(0xbb7e2a7e), C32e(0x7b34e534), C32e(0xd73a163a), + C32e(0xd254c554), C32e(0xf862d762), C32e(0x99ff55ff), C32e(0xb6a794a7), + C32e(0xc04acf4a), C32e(0xd9301030), C32e(0x0e0a060a), C32e(0x66988198), + C32e(0xab0bf00b), C32e(0xb4cc44cc), C32e(0xf0d5bad5), C32e(0x753ee33e), + C32e(0xac0ef30e), C32e(0x4419fe19), C32e(0xdb5bc05b), C32e(0x80858a85), + C32e(0xd3ecadec), C32e(0xfedfbcdf), C32e(0xa8d848d8), C32e(0xfd0c040c), + C32e(0x197adf7a), C32e(0x2f58c158), C32e(0x309f759f), C32e(0xe7a563a5), + C32e(0x70503050), C32e(0xcb2e1a2e), C32e(0xef120e12), C32e(0x08b76db7), + C32e(0x55d44cd4), C32e(0x243c143c), C32e(0x795f355f), C32e(0xb2712f71), + C32e(0x8638e138), C32e(0xc8fda2fd), C32e(0xc74fcc4f), C32e(0x654b394b), + C32e(0x6af957f9), C32e(0x580df20d), C32e(0x619d829d), 
C32e(0xb3c947c9), + C32e(0x27efacef), C32e(0x8832e732), C32e(0x4f7d2b7d), C32e(0x42a495a4), + C32e(0x3bfba0fb), C32e(0xaab398b3), C32e(0xf668d168), C32e(0x22817f81), + C32e(0xeeaa66aa), C32e(0xd6827e82), C32e(0xdde6abe6), C32e(0x959e839e), + C32e(0xc945ca45), C32e(0xbc7b297b), C32e(0x056ed36e), C32e(0x6c443c44), + C32e(0x2c8b798b), C32e(0x813de23d), C32e(0x31271d27), C32e(0x379a769a), + C32e(0x964d3b4d), C32e(0x9efa56fa), C32e(0xa6d24ed2), C32e(0x36221e22), + C32e(0xe476db76), C32e(0x121e0a1e), C32e(0xfcb46cb4), C32e(0x8f37e437), + C32e(0x78e75de7), C32e(0x0fb26eb2), C32e(0x692aef2a), C32e(0x35f1a6f1), + C32e(0xdae3a8e3), C32e(0xc6f7a4f7), C32e(0x8a593759), C32e(0x74868b86), + C32e(0x83563256), C32e(0x4ec543c5), C32e(0x85eb59eb), C32e(0x18c2b7c2), + C32e(0x8e8f8c8f), C32e(0x1dac64ac), C32e(0xf16dd26d), C32e(0x723be03b), + C32e(0x1fc7b4c7), C32e(0xb915fa15), C32e(0xfa090709), C32e(0xa06f256f), + C32e(0x20eaafea), C32e(0x7d898e89), C32e(0x6720e920), C32e(0x38281828), + C32e(0x0b64d564), C32e(0x73838883), C32e(0xfbb16fb1), C32e(0xca967296), + C32e(0x546c246c), C32e(0x5f08f108), C32e(0x2152c752), C32e(0x64f351f3), + C32e(0xae652365), C32e(0x25847c84), C32e(0x57bf9cbf), C32e(0x5d632163), + C32e(0xea7cdd7c), C32e(0x1e7fdc7f), C32e(0x9c918691), C32e(0x9b948594), + C32e(0x4bab90ab), C32e(0xbac642c6), C32e(0x2657c457), C32e(0x29e5aae5), + C32e(0xe373d873), C32e(0x090f050f), C32e(0xf4030103), C32e(0x2a361236), + C32e(0x3cfea3fe), C32e(0x8be15fe1), C32e(0xbe10f910), C32e(0x026bd06b), + C32e(0xbfa891a8), C32e(0x71e858e8), C32e(0x53692769), C32e(0xf7d0b9d0), + C32e(0x91483848), C32e(0xde351335), C32e(0xe5ceb3ce), C32e(0x77553355), + C32e(0x04d6bbd6), C32e(0x39907090), C32e(0x87808980), C32e(0xc1f2a7f2), + C32e(0xecc1b6c1), C32e(0x5a662266), C32e(0xb8ad92ad), C32e(0xa9602060), + C32e(0x5cdb49db), C32e(0xb01aff1a), C32e(0xd8887888), C32e(0x2b8e7a8e), + C32e(0x898a8f8a), C32e(0x4a13f813), C32e(0x929b809b), C32e(0x23391739), + C32e(0x1075da75), C32e(0x84533153), C32e(0xd551c651), C32e(0x03d3b8d3), + C32e(0xdc5ec35e), C32e(0xe2cbb0cb), C32e(0xc3997799), C32e(0x2d331133), + C32e(0x3d46cb46), C32e(0xb71ffc1f), C32e(0x0c61d661), C32e(0x624e3a4e) +}; + +#define DECL_STATE_SMALL \ + sph_u32 H[16]; + +#define READ_STATE_SMALL(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#define ROUND_SMALL_P(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= PC32up(0x00, r); \ + a[0x1] ^= PC32dn(0x00, r); \ + a[0x2] ^= PC32up(0x10, r); \ + a[0x3] ^= PC32dn(0x10, r); \ + a[0x4] ^= PC32up(0x20, r); \ + a[0x5] ^= PC32dn(0x20, r); \ + a[0x6] ^= PC32up(0x30, r); \ + a[0x7] ^= PC32dn(0x30, r); \ + a[0x8] ^= PC32up(0x40, r); \ + a[0x9] ^= PC32dn(0x40, r); \ + a[0xA] ^= PC32up(0x50, r); \ + a[0xB] ^= PC32dn(0x50, r); \ + a[0xC] ^= PC32up(0x60, r); \ + a[0xD] ^= PC32dn(0x60, r); \ + a[0xE] ^= PC32up(0x70, r); \ + a[0xF] ^= PC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 
0x6, 0x9, 0xB, 0xD, 0xF); \ + RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1); \ + RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3); \ + RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5); \ + RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7); \ + RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9); \ + RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB); \ + RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_SMALL_Q(a, r) do { \ + sph_u32 t[16]; \ + a[0x0] ^= QC32up(0x00, r); \ + a[0x1] ^= QC32dn(0x00, r); \ + a[0x2] ^= QC32up(0x10, r); \ + a[0x3] ^= QC32dn(0x10, r); \ + a[0x4] ^= QC32up(0x20, r); \ + a[0x5] ^= QC32dn(0x20, r); \ + a[0x6] ^= QC32up(0x30, r); \ + a[0x7] ^= QC32dn(0x30, r); \ + a[0x8] ^= QC32up(0x40, r); \ + a[0x9] ^= QC32dn(0x40, r); \ + a[0xA] ^= QC32up(0x50, r); \ + a[0xB] ^= QC32dn(0x50, r); \ + a[0xC] ^= QC32up(0x60, r); \ + a[0xD] ^= QC32dn(0x60, r); \ + a[0xE] ^= QC32up(0x70, r); \ + a[0xF] ^= QC32dn(0x70, r); \ + RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD); \ + RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF); \ + RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1); \ + RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3); \ + RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5); \ + RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7); \ + RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9); \ + RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_P(a, r); \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r ++) \ + ROUND_SMALL_Q(a, r); \ + } while (0) + +#else + +#define PERM_SMALL_P(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_P(a, r + 0); \ + ROUND_SMALL_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_SMALL_Q(a) do { \ + int r; \ + for (r = 0; r < 10; r += 2) { \ + ROUND_SMALL_Q(a, r + 0); \ + ROUND_SMALL_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_SMALL do { \ + sph_u32 g[16], m[16]; \ + size_t u; \ + for (u = 0; u < 16; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_SMALL_P(g); \ + PERM_SMALL_Q(m); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_SMALL do { \ + sph_u32 x[16]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_SMALL_P(x); \ + for (u = 0; u < 16; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#define DECL_STATE_BIG \ + sph_u32 H[32]; + +#define READ_STATE_BIG(sc) do { \ + memcpy(H, (sc)->state.narrow, sizeof H); \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + memcpy((sc)->state.narrow, H, sizeof H); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + sph_u32 fu2 = T0up[B32_2(a[b2])]; \ + sph_u32 fd2 = T0dn[B32_2(a[b2])]; \ + sph_u32 fu3 = T1up[B32_3(a[b3])]; \ + sph_u32 fd3 = T1dn[B32_3(a[b3])]; \ + sph_u32 fu6 = T0up[B32_2(a[b6])]; \ + sph_u32 fd6 = T0dn[B32_2(a[b6])]; \ + sph_u32 fu7 = T1up[B32_3(a[b7])]; \ + sph_u32 fd7 = T1dn[B32_3(a[b7])]; \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ R32u(fu2, fd2) \ + ^ R32u(fu3, fd3) \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ R32d(fu6, fd6) \ + ^ R32d(fu7, fd7); \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ 
T1dn[B32_1(a[b1])] \ + ^ R32d(fu2, fd2) \ + ^ R32d(fu3, fd3) \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ R32u(fu6, fd6) \ + ^ R32u(fu7, fd7); \ + } while (0) + +#else + +#define RBTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \ + t[d0] = T0up[B32_0(a[b0])] \ + ^ T1up[B32_1(a[b1])] \ + ^ T2up[B32_2(a[b2])] \ + ^ T3up[B32_3(a[b3])] \ + ^ T0dn[B32_0(a[b4])] \ + ^ T1dn[B32_1(a[b5])] \ + ^ T2dn[B32_2(a[b6])] \ + ^ T3dn[B32_3(a[b7])]; \ + t[d1] = T0dn[B32_0(a[b0])] \ + ^ T1dn[B32_1(a[b1])] \ + ^ T2dn[B32_2(a[b2])] \ + ^ T3dn[B32_3(a[b3])] \ + ^ T0up[B32_0(a[b4])] \ + ^ T1up[B32_1(a[b5])] \ + ^ T2up[B32_2(a[b6])] \ + ^ T3up[B32_3(a[b7])]; \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + u + 0x00, (u + 0x02) & 0x1F, \ + (u + 0x04) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x17) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + u + 0x02, (u + 0x04) & 0x1F, \ + (u + 0x06) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0D) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x19) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + u + 0x04, (u + 0x06) & 0x1F, \ + (u + 0x08) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x0F) & 0x1F, \ + (u + 0x11) & 0x1F, (u + 0x1B) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + u + 0x06, (u + 0x08) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x11) & 0x1F, \ + (u + 0x13) & 0x1F, (u + 0x1D) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, r) do { \ + sph_u32 t[32]; \ + size_t u; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= 
QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + for (u = 0; u < 32; u += 8) { \ + RBTT(u + 0x00, (u + 0x01) & 0x1F, a, \ + (u + 0x02) & 0x1F, (u + 0x06) & 0x1F, \ + (u + 0x0A) & 0x1F, (u + 0x16) & 0x1F, \ + (u + 0x01) & 0x1F, (u + 0x05) & 0x1F, \ + (u + 0x09) & 0x1F, (u + 0x0D) & 0x1F); \ + RBTT(u + 0x02, (u + 0x03) & 0x1F, a, \ + (u + 0x04) & 0x1F, (u + 0x08) & 0x1F, \ + (u + 0x0C) & 0x1F, (u + 0x18) & 0x1F, \ + (u + 0x03) & 0x1F, (u + 0x07) & 0x1F, \ + (u + 0x0B) & 0x1F, (u + 0x0F) & 0x1F); \ + RBTT(u + 0x04, (u + 0x05) & 0x1F, a, \ + (u + 0x06) & 0x1F, (u + 0x0A) & 0x1F, \ + (u + 0x0E) & 0x1F, (u + 0x1A) & 0x1F, \ + (u + 0x05) & 0x1F, (u + 0x09) & 0x1F, \ + (u + 0x0D) & 0x1F, (u + 0x11) & 0x1F); \ + RBTT(u + 0x06, (u + 0x07) & 0x1F, a, \ + (u + 0x08) & 0x1F, (u + 0x0C) & 0x1F, \ + (u + 0x10) & 0x1F, (u + 0x1C) & 0x1F, \ + (u + 0x07) & 0x1F, (u + 0x0B) & 0x1F, \ + (u + 0x0F) & 0x1F, (u + 0x13) & 0x1F); \ + } \ + memcpy(a, t, sizeof t); \ + } while (0) + +#else + +#define ROUND_BIG_P(a, r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= PC32up(0x00, r); \ + a[0x01] ^= PC32dn(0x00, r); \ + a[0x02] ^= PC32up(0x10, r); \ + a[0x03] ^= PC32dn(0x10, r); \ + a[0x04] ^= PC32up(0x20, r); \ + a[0x05] ^= PC32dn(0x20, r); \ + a[0x06] ^= PC32up(0x30, r); \ + a[0x07] ^= PC32dn(0x30, r); \ + a[0x08] ^= PC32up(0x40, r); \ + a[0x09] ^= PC32dn(0x40, r); \ + a[0x0A] ^= PC32up(0x50, r); \ + a[0x0B] ^= PC32dn(0x50, r); \ + a[0x0C] ^= PC32up(0x60, r); \ + a[0x0D] ^= PC32dn(0x60, r); \ + a[0x0E] ^= PC32up(0x70, r); \ + a[0x0F] ^= PC32dn(0x70, r); \ + a[0x10] ^= PC32up(0x80, r); \ + a[0x11] ^= PC32dn(0x80, r); \ + a[0x12] ^= PC32up(0x90, r); \ + a[0x13] ^= PC32dn(0x90, r); \ + a[0x14] ^= PC32up(0xA0, r); \ + a[0x15] ^= PC32dn(0xA0, r); \ + a[0x16] ^= PC32up(0xB0, r); \ + a[0x17] ^= PC32dn(0xB0, r); \ + a[0x18] ^= PC32up(0xC0, r); \ + a[0x19] ^= PC32dn(0xC0, r); \ + a[0x1A] ^= PC32up(0xD0, r); \ + a[0x1B] ^= PC32dn(0xD0, r); \ + a[0x1C] ^= PC32up(0xE0, r); \ + a[0x1D] ^= PC32dn(0xE0, r); \ + a[0x1E] ^= PC32up(0xF0, r); \ + a[0x1F] ^= PC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x00, 0x02, 0x04, 0x06, 0x09, 0x0B, 0x0D, 0x17); \ + RBTT(0x02, 0x03, a, \ + 0x02, 0x04, 0x06, 0x08, 0x0B, 0x0D, 0x0F, 0x19); \ + RBTT(0x04, 0x05, a, \ + 0x04, 0x06, 0x08, 0x0A, 0x0D, 0x0F, 0x11, 0x1B); \ + RBTT(0x06, 0x07, a, \ + 0x06, 0x08, 0x0A, 0x0C, 0x0F, 0x11, 0x13, 0x1D); \ + RBTT(0x08, 0x09, a, \ + 0x08, 0x0A, 0x0C, 0x0E, 0x11, 0x13, 0x15, 0x1F); \ + RBTT(0x0A, 0x0B, a, \ + 0x0A, 0x0C, 0x0E, 0x10, 0x13, 0x15, 0x17, 0x01); \ + RBTT(0x0C, 0x0D, a, \ + 0x0C, 0x0E, 0x10, 0x12, 0x15, 0x17, 0x19, 0x03); \ + RBTT(0x0E, 0x0F, a, \ + 0x0E, 0x10, 0x12, 0x14, 0x17, 0x19, 0x1B, 0x05); \ + RBTT(0x10, 0x11, a, \ + 0x10, 0x12, 0x14, 0x16, 0x19, 0x1B, 0x1D, 0x07); \ + RBTT(0x12, 0x13, a, \ + 0x12, 0x14, 0x16, 0x18, 0x1B, 0x1D, 0x1F, 0x09); \ + RBTT(0x14, 0x15, a, \ + 0x14, 0x16, 0x18, 0x1A, 0x1D, 0x1F, 0x01, 0x0B); \ + RBTT(0x16, 0x17, a, \ + 0x16, 0x18, 0x1A, 0x1C, 0x1F, 0x01, 0x03, 0x0D); \ + RBTT(0x18, 0x19, a, \ + 0x18, 0x1A, 0x1C, 0x1E, 0x01, 0x03, 0x05, 0x0F); \ + RBTT(0x1A, 0x1B, a, \ + 0x1A, 0x1C, 0x1E, 0x00, 0x03, 0x05, 0x07, 0x11); \ + RBTT(0x1C, 0x1D, a, \ + 0x1C, 0x1E, 0x00, 0x02, 0x05, 0x07, 0x09, 0x13); \ + RBTT(0x1E, 0x1F, a, \ + 0x1E, 0x00, 0x02, 0x04, 0x07, 0x09, 0x0B, 0x15); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#define ROUND_BIG_Q(a, 
r) do { \ + sph_u32 t[32]; \ + a[0x00] ^= QC32up(0x00, r); \ + a[0x01] ^= QC32dn(0x00, r); \ + a[0x02] ^= QC32up(0x10, r); \ + a[0x03] ^= QC32dn(0x10, r); \ + a[0x04] ^= QC32up(0x20, r); \ + a[0x05] ^= QC32dn(0x20, r); \ + a[0x06] ^= QC32up(0x30, r); \ + a[0x07] ^= QC32dn(0x30, r); \ + a[0x08] ^= QC32up(0x40, r); \ + a[0x09] ^= QC32dn(0x40, r); \ + a[0x0A] ^= QC32up(0x50, r); \ + a[0x0B] ^= QC32dn(0x50, r); \ + a[0x0C] ^= QC32up(0x60, r); \ + a[0x0D] ^= QC32dn(0x60, r); \ + a[0x0E] ^= QC32up(0x70, r); \ + a[0x0F] ^= QC32dn(0x70, r); \ + a[0x10] ^= QC32up(0x80, r); \ + a[0x11] ^= QC32dn(0x80, r); \ + a[0x12] ^= QC32up(0x90, r); \ + a[0x13] ^= QC32dn(0x90, r); \ + a[0x14] ^= QC32up(0xA0, r); \ + a[0x15] ^= QC32dn(0xA0, r); \ + a[0x16] ^= QC32up(0xB0, r); \ + a[0x17] ^= QC32dn(0xB0, r); \ + a[0x18] ^= QC32up(0xC0, r); \ + a[0x19] ^= QC32dn(0xC0, r); \ + a[0x1A] ^= QC32up(0xD0, r); \ + a[0x1B] ^= QC32dn(0xD0, r); \ + a[0x1C] ^= QC32up(0xE0, r); \ + a[0x1D] ^= QC32dn(0xE0, r); \ + a[0x1E] ^= QC32up(0xF0, r); \ + a[0x1F] ^= QC32dn(0xF0, r); \ + RBTT(0x00, 0x01, a, \ + 0x02, 0x06, 0x0A, 0x16, 0x01, 0x05, 0x09, 0x0D); \ + RBTT(0x02, 0x03, a, \ + 0x04, 0x08, 0x0C, 0x18, 0x03, 0x07, 0x0B, 0x0F); \ + RBTT(0x04, 0x05, a, \ + 0x06, 0x0A, 0x0E, 0x1A, 0x05, 0x09, 0x0D, 0x11); \ + RBTT(0x06, 0x07, a, \ + 0x08, 0x0C, 0x10, 0x1C, 0x07, 0x0B, 0x0F, 0x13); \ + RBTT(0x08, 0x09, a, \ + 0x0A, 0x0E, 0x12, 0x1E, 0x09, 0x0D, 0x11, 0x15); \ + RBTT(0x0A, 0x0B, a, \ + 0x0C, 0x10, 0x14, 0x00, 0x0B, 0x0F, 0x13, 0x17); \ + RBTT(0x0C, 0x0D, a, \ + 0x0E, 0x12, 0x16, 0x02, 0x0D, 0x11, 0x15, 0x19); \ + RBTT(0x0E, 0x0F, a, \ + 0x10, 0x14, 0x18, 0x04, 0x0F, 0x13, 0x17, 0x1B); \ + RBTT(0x10, 0x11, a, \ + 0x12, 0x16, 0x1A, 0x06, 0x11, 0x15, 0x19, 0x1D); \ + RBTT(0x12, 0x13, a, \ + 0x14, 0x18, 0x1C, 0x08, 0x13, 0x17, 0x1B, 0x1F); \ + RBTT(0x14, 0x15, a, \ + 0x16, 0x1A, 0x1E, 0x0A, 0x15, 0x19, 0x1D, 0x01); \ + RBTT(0x16, 0x17, a, \ + 0x18, 0x1C, 0x00, 0x0C, 0x17, 0x1B, 0x1F, 0x03); \ + RBTT(0x18, 0x19, a, \ + 0x1A, 0x1E, 0x02, 0x0E, 0x19, 0x1D, 0x01, 0x05); \ + RBTT(0x1A, 0x1B, a, \ + 0x1C, 0x00, 0x04, 0x10, 0x1B, 0x1F, 0x03, 0x07); \ + RBTT(0x1C, 0x1D, a, \ + 0x1E, 0x02, 0x06, 0x12, 0x1D, 0x01, 0x05, 0x09); \ + RBTT(0x1E, 0x1F, a, \ + 0x00, 0x04, 0x08, 0x14, 0x1F, 0x03, 0x07, 0x0B); \ + memcpy(a, t, sizeof t); \ + } while (0) + +#endif + +#if SPH_SMALL_FOOTPRINT_GROESTL + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_P(a, r); \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r ++) \ + ROUND_BIG_Q(a, r); \ + } while (0) + +#else + +#define PERM_BIG_P(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_P(a, r + 0); \ + ROUND_BIG_P(a, r + 1); \ + } \ + } while (0) + +#define PERM_BIG_Q(a) do { \ + int r; \ + for (r = 0; r < 14; r += 2) { \ + ROUND_BIG_Q(a, r + 0); \ + ROUND_BIG_Q(a, r + 1); \ + } \ + } while (0) + +#endif + +#define COMPRESS_BIG do { \ + sph_u32 g[32], m[32]; \ + size_t u; \ + for (u = 0; u < 32; u ++) { \ + m[u] = dec32e_aligned(buf + (u << 2)); \ + g[u] = m[u] ^ H[u]; \ + } \ + PERM_BIG_P(g); \ + PERM_BIG_Q(m); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= g[u] ^ m[u]; \ + } while (0) + +#define FINAL_BIG do { \ + sph_u32 x[32]; \ + size_t u; \ + memcpy(x, H, sizeof x); \ + PERM_BIG_P(x); \ + for (u = 0; u < 32; u ++) \ + H[u] ^= x[u]; \ + } while (0) + +#endif + +static void +groestl_small_init(sph_groestl_small_context *sc, unsigned out_size) +{ + size_t u; + + sc->ptr = 0; +#if SPH_GROESTL_64 + for (u = 0; u < 7; u ++) + 
sc->state.wide[u] = 0;
+#if USE_LE
+	sc->state.wide[7] = ((sph_u64)(out_size & 0xFF) << 56)
+		| ((sph_u64)(out_size & 0xFF00) << 40);
+#else
+	sc->state.wide[7] = (sph_u64)out_size;
+#endif
+#else
+	for (u = 0; u < 15; u ++)
+		sc->state.narrow[u] = 0;
+#if USE_LE
+	sc->state.narrow[15] = ((sph_u32)(out_size & 0xFF) << 24)
+		| ((sph_u32)(out_size & 0xFF00) << 8);
+#else
+	sc->state.narrow[15] = (sph_u32)out_size;
+#endif
+#endif
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = 0;
+	sc->count_low = 0;
+#endif
+}
+
+static void
+groestl_small_core(sph_groestl_small_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_SMALL
+
+	buf = sc->buf;
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_SMALL(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			COMPRESS_SMALL;
+#if SPH_64
+			sc->count ++;
+#else
+			if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0)
+				sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE_SMALL(sc);
+	sc->ptr = ptr;
+}
+
+static void
+groestl_small_close(sph_groestl_small_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char pad[72];
+	size_t u, ptr, pad_len;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_SMALL
+
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+	if (ptr < 56) {
+		pad_len = 64 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_small_core(sc, pad, pad_len);
+	READ_STATE_SMALL(sc);
+	FINAL_SMALL;
+#if SPH_GROESTL_64
+	for (u = 0; u < 4; u ++)
+		enc64e(pad + (u << 3), H[u + 4]);
+#else
+	for (u = 0; u < 8; u ++)
+		enc32e(pad + (u << 2), H[u + 8]);
+#endif
+	memcpy(dst, pad + 32 - out_len, out_len);
+	groestl_small_init(sc, (unsigned)out_len << 3);
+}
+
+static void
+groestl_big_init(sph_groestl_big_context *sc, unsigned out_size)
+{
+	size_t u;
+
+	sc->ptr = 0;
+#if SPH_GROESTL_64
+	for (u = 0; u < 15; u ++)
+		sc->state.wide[u] = 0;
+#if USE_LE
+	sc->state.wide[15] = ((sph_u64)(out_size & 0xFF) << 56)
+		| ((sph_u64)(out_size & 0xFF00) << 40);
+#else
+	sc->state.wide[15] = (sph_u64)out_size;
+#endif
+#else
+	for (u = 0; u < 31; u ++)
+		sc->state.narrow[u] = 0;
+#if USE_LE
+	sc->state.narrow[31] = ((sph_u32)(out_size & 0xFF) << 24)
+		| ((sph_u32)(out_size & 0xFF00) << 8);
+#else
+	sc->state.narrow[31] = (sph_u32)out_size;
+#endif
+#endif
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = 0;
+	sc->count_low = 0;
+#endif
+}
+
+static void
+groestl_big_core(sph_groestl_big_context *sc, const void *data, size_t len)
+{
+	unsigned char *buf;
+	size_t ptr;
+	DECL_STATE_BIG
+
+	buf = sc->buf;
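+	/* Same buffering strategy as groestl_small_core, but with the
+	 * 128-byte block size of the big (384/512-bit) variants. */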
+	ptr = sc->ptr;
+	if (len < (sizeof sc->buf) - ptr) {
+		memcpy(buf + ptr, data, len);
+		ptr += len;
+		sc->ptr = ptr;
+		return;
+	}
+
+	READ_STATE_BIG(sc);
+	while (len > 0) {
+		size_t clen;
+
+		clen = (sizeof sc->buf) - ptr;
+		if (clen > len)
+			clen = len;
+		memcpy(buf + ptr, data, clen);
+		ptr += clen;
+		data = (const unsigned char *)data + clen;
+		len -= clen;
+		if (ptr == sizeof sc->buf) {
+			COMPRESS_BIG;
+#if SPH_64
+			sc->count ++;
+#else
+			if ((sc->count_low = SPH_T32(sc->count_low + 1)) == 0)
+				sc->count_high = SPH_T32(sc->count_high + 1);
+#endif
+			ptr = 0;
+		}
+	}
+	WRITE_STATE_BIG(sc);
+	sc->ptr = ptr;
+}
+
+static void
+groestl_big_close(sph_groestl_big_context *sc,
+	unsigned ub, unsigned n, void *dst, size_t out_len)
+{
+	unsigned char pad[136];
+	size_t ptr, pad_len, u;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+	unsigned z;
+	DECL_STATE_BIG
+
+	ptr = sc->ptr;
+	z = 0x80 >> n;
+	pad[0] = ((ub & -z) | z) & 0xFF;
+	if (ptr < 120) {
+		pad_len = 128 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 1);
+#else
+		count_low = SPH_T32(sc->count_low + 1);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low == 0)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	} else {
+		pad_len = 256 - ptr;
+#if SPH_64
+		count = SPH_T64(sc->count + 2);
+#else
+		count_low = SPH_T32(sc->count_low + 2);
+		count_high = SPH_T32(sc->count_high);
+		if (count_low <= 1)
+			count_high = SPH_T32(count_high + 1);
+#endif
+	}
+	memset(pad + 1, 0, pad_len - 9);
+#if SPH_64
+	sph_enc64be(pad + pad_len - 8, count);
+#else
+	sph_enc32be(pad + pad_len - 8, count_high);
+	sph_enc32be(pad + pad_len - 4, count_low);
+#endif
+	groestl_big_core(sc, pad, pad_len);
+	READ_STATE_BIG(sc);
+	FINAL_BIG;
+#if SPH_GROESTL_64
+	for (u = 0; u < 8; u ++)
+		enc64e(pad + (u << 3), H[u + 8]);
+#else
+	for (u = 0; u < 16; u ++)
+		enc32e(pad + (u << 2), H[u + 16]);
+#endif
+	memcpy(dst, pad + 64 - out_len, out_len);
+	groestl_big_init(sc, (unsigned)out_len << 3);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_init(void *cc)
+{
+	groestl_small_init(cc, 224);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 28);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 28);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_init(void *cc)
+{
+	groestl_small_init(cc, 256);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256(void *cc, const void *data, size_t len)
+{
+	groestl_small_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_close(void *cc, void *dst)
+{
+	groestl_small_close(cc, 0, 0, dst, 32);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_small_close(cc, ub, n, dst, 32);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_init(void *cc)
+{
+	groestl_big_init(cc, 384);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384(void *cc, const void *data, size_t len)
+{
+	groestl_big_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_close(void *cc, void *dst)
+{
+	groestl_big_close(cc, 0, 0, dst, 48);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close(cc, ub, n, dst, 48);
+}
+
+/* see
sph_groestl.h */
+void
+sph_groestl512_init(void *cc)
+{
+	groestl_big_init(cc, 512);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl512(void *cc, const void *data, size_t len)
+{
+	groestl_big_core(cc, data, len);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl512_close(void *cc, void *dst)
+{
+	groestl_big_close(cc, 0, 0, dst, 64);
+}
+
+/* see sph_groestl.h */
+void
+sph_groestl512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	groestl_big_close(cc, ub, n, dst, 64);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/sph_groestl.h b/sha3/sph_groestl.h
new file mode 100644
index 00000000..495f05e2
--- /dev/null
+++ b/sha3/sph_groestl.h
@@ -0,0 +1,329 @@
+/* $Id: sph_groestl.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * Groestl interface. This code implements Groestl with the recommended
+ * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file     sph_groestl.h
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_GROESTL_H__
+#define SPH_GROESTL_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Groestl-224.
+ */
+#define SPH_SIZE_groestl224   224
+
+/**
+ * Output size (in bits) for Groestl-256.
+ */
+#define SPH_SIZE_groestl256   256
+
+/**
+ * Output size (in bits) for Groestl-384.
+ */
+#define SPH_SIZE_groestl384   384
+
+/**
+ * Output size (in bits) for Groestl-512.
+ */
+#define SPH_SIZE_groestl512   512
+
+/**
+ * This structure is a context for Groestl-224 and Groestl-256 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
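+ *
+ * For example (an illustrative sketch, not part of the original
+ * documentation; prefix, msg1, msg2 and their lengths are hypothetical
+ * caller buffers), cloning lets two messages share a hashed prefix:
+ *
+ *   sph_groestl256_context base, fork;
+ *   sph_groestl256_init(&base);
+ *   sph_groestl256(&base, prefix, prefix_len);
+ *   fork = base;   // plain struct copy clones the running state
+ *   sph_groestl256(&base, msg1, len1);
+ *   sph_groestl256(&fork, msg2, len2);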
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[64];    /* first field, for alignment */
+	size_t ptr;
+	union {
+#if SPH_64
+		sph_u64 wide[8];
+#endif
+		sph_u32 narrow[16];
+	} state;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_groestl_small_context;
+
+/**
+ * This structure is a context for Groestl-224 computations. It is
+ * identical to the common sph_groestl_small_context.
+ */
+typedef sph_groestl_small_context sph_groestl224_context;
+
+/**
+ * This structure is a context for Groestl-256 computations. It is
+ * identical to the common sph_groestl_small_context.
+ */
+typedef sph_groestl_small_context sph_groestl256_context;
+
+/**
+ * This structure is a context for Groestl-384 and Groestl-512 computations:
+ * it contains the intermediate values and some data from the last
+ * entered block. Once a Groestl computation has been performed, the
+ * context can be reused for another computation.
+ *
+ * The contents of this structure are private. A running Groestl
+ * computation can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+	unsigned char buf[128];   /* first field, for alignment */
+	size_t ptr;
+	union {
+#if SPH_64
+		sph_u64 wide[16];
+#endif
+		sph_u32 narrow[32];
+	} state;
+#if SPH_64
+	sph_u64 count;
+#else
+	sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_groestl_big_context;
+
+/**
+ * This structure is a context for Groestl-384 computations. It is
+ * identical to the common sph_groestl_big_context.
+ */
+typedef sph_groestl_big_context sph_groestl384_context;
+
+/**
+ * This structure is a context for Groestl-512 computations. It is
+ * identical to the common sph_groestl_big_context.
+ */
+typedef sph_groestl_big_context sph_groestl512_context;
+
+/**
+ * Initialize a Groestl-224 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-224 context (pointer to a
+ *             sph_groestl224_context)
+ */
+void sph_groestl224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-224 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-224 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl224_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-256 context. This process performs no memory allocation.
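+ *
+ * A minimal one-shot usage sketch (illustrative only; msg, msg_len and
+ * out are hypothetical caller-supplied values):
+ *
+ *   sph_groestl256_context cc;
+ *   unsigned char out[32];
+ *   sph_groestl256_init(&cc);
+ *   sph_groestl256(&cc, msg, msg_len);
+ *   sph_groestl256_close(&cc, out);   // cc is reinitialized here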
+ *
+ * @param cc   the Groestl-256 context (pointer to a
+ *             sph_groestl256_context)
+ */
+void sph_groestl256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-256 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-256 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl256_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-384 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-384 context (pointer to a
+ *             sph_groestl384_context)
+ */
+void sph_groestl384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc     the Groestl-384 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-384 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl384_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Groestl-512 context. This process performs no memory allocation.
+ *
+ * @param cc   the Groestl-512 context (pointer to a
+ *             sph_groestl512_context)
+ */
+void sph_groestl512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
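+ * Input may also be supplied incrementally: hashing a buffer in
+ * several calls yields the same digest as one call over the whole
+ * buffer. For instance (illustrative only), sph_groestl512(cc, buf, 10)
+ * followed by sph_groestl512(cc, buf + 10, len - 10) is equivalent to
+ * a single sph_groestl512(cc, buf, len).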
+ *
+ * @param cc     the Groestl-512 context
+ * @param data   the input data
+ * @param len    the input data length (in bytes)
+ */
+void sph_groestl512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Groestl-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc    the Groestl-512 context
+ * @param ub    the extra bits
+ * @param n     the number of extra bits (0 to 7)
+ * @param dst   the destination buffer
+ */
+void sph_groestl512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_hefty1.c b/sha3/sph_hefty1.c
new file mode 100644
index 00000000..fadd151e
--- /dev/null
+++ b/sha3/sph_hefty1.c
@@ -0,0 +1,378 @@
+/*
+ * HEFTY1 cryptographic hash function
+ *
+ * Copyright (c) 2014, dbcc14
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation are those
+ * of the authors and should not be interpreted as representing official policies,
+ * either expressed or implied, of the FreeBSD Project.
+ */
+
+#include <assert.h>	/* assert() */
+#include <string.h>	/* memcpy(), memset() */
+
+#include "sph_hefty1.h"
+
+#define Min(A, B) ((A) <= (B) ? (A) : (B))
+#define RoundFunc(ctx, A, B, C, D, E, F, G, H, W, K) \
+	{ \
+		/* To thwart parallelism, Br modifies itself each time it's \
+		 * called. This also means that calling it in different \
+		 * orders yields different results. In C the order of \
+		 * evaluation of function arguments and + operands is \
+		 * unspecified (and depends on the compiler), so we must make \
+		 * the order of Br calls explicit.
\ + */ \ + uint32_t brG = Br(ctx, G); \ + uint32_t tmp1 = Ch(E, Br(ctx, F), brG) + H + W + K; \ + uint32_t tmp2 = tmp1 + Sigma1(Br(ctx, E)); \ + uint32_t brC = Br(ctx, C); \ + uint32_t brB = Br(ctx, B); \ + uint32_t tmp3 = Ma(Br(ctx, A), brB, brC); \ + uint32_t tmp4 = tmp3 + Sigma0(Br(ctx, A)); \ + H = G; \ + G = F; \ + F = E; \ + E = D + Br(ctx, tmp2); \ + D = C; \ + C = B; \ + B = A; \ + A = tmp2 + tmp4; \ + } \ + +/* Nothing up my sleeve constants */ +const static uint32_t K[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, + 0x3956c25bUL, 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, + 0xd807aa98UL, 0x12835b01UL, 0x243185beUL, 0x550c7dc3UL, + 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, 0xc19bf174UL, + 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, + 0x983e5152UL, 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, + 0xc6e00bf3UL, 0xd5a79147UL, 0x06ca6351UL, 0x14292967UL, + 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, 0x53380d13UL, + 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, + 0xd192e819UL, 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, + 0x19a4c116UL, 0x1e376c08UL, 0x2748774cUL, 0x34b0bcb5UL, + 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, 0x682e6ff3UL, + 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL +}; + +/* Initial hash values */ +const static uint32_t H[HEFTY1_STATE_WORDS] = { + 0x6a09e667UL, + 0xbb67ae85UL, + 0x3c6ef372UL, + 0xa54ff53aUL, + 0x510e527fUL, + 0x9b05688cUL, + 0x1f83d9abUL, + 0x5be0cd19UL +}; + +static inline uint32_t Rr(uint32_t X, uint8_t n) +{ + return (X >> n) | (X << (32 - n)); +} + +static inline uint32_t Ch(uint32_t E, uint32_t F, uint32_t G) +{ + return (E & F) ^ (~E & G); +} + +static inline uint32_t Sigma1(uint32_t E) +{ + return Rr(E, 6) ^ Rr(E, 11) ^ Rr(E, 25); +} + +static inline uint32_t sigma1(uint32_t X) +{ + return Rr(X, 17) ^ Rr(X, 19) ^ (X >> 10); +} + +static inline uint32_t Ma(uint32_t A, uint32_t B, uint32_t C) +{ + return (A & B) ^ (A & C) ^ (B & C); +} + +static inline uint32_t Sigma0(uint32_t A) +{ + return Rr(A, 2) ^ Rr(A, 13) ^ Rr(A, 22); +} + +static inline uint32_t sigma0(uint32_t X) +{ + return Rr(X, 7) ^ Rr(X, 18) ^ (X >> 3); +} + +static inline uint32_t Reverse32(uint32_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + return n << 24 | (n & 0x0000ff00) << 8 | (n & 0x00ff0000) >> 8 | n >> 24; + #else + return n; + #endif +} + +static inline uint64_t Reverse64(uint64_t n) +{ + #if BYTE_ORDER == LITTLE_ENDIAN + uint32_t a = n >> 32; + uint32_t b = (n << 32) >> 32; + + return (uint64_t)Reverse32(b) << 32 | Reverse32(a); + #else + return n; + #endif +} + +/* Smoosh byte into nibble */ +static inline uint8_t Smoosh4(uint8_t X) +{ + return (X >> 4) ^ (X & 0xf); +} + +/* Smoosh 32-bit word into 2-bits */ +static inline uint8_t Smoosh2(uint32_t X) +{ + uint16_t w = (X >> 16) ^ (X & 0xffff); + uint8_t n = Smoosh4((w >> 8) ^ (w & 0xff)); + return (n >> 2) ^ (n & 0x3); +} + +static void Mangle(uint32_t *S) +{ + uint32_t *R = S; + uint32_t *C = &S[1]; + + uint8_t r0 = Smoosh4(R[0] >> 24); + uint8_t r1 = Smoosh4(R[0] >> 16); + uint8_t r2 = Smoosh4(R[0] >> 8); + uint8_t r3 = Smoosh4(R[0] & 0xff); + + int i; + + /* Diffuse */ + uint32_t tmp = 0; + for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) { + uint8_t r = Smoosh2(tmp); + switch (r) { + case 0: + C[i] ^= Rr(R[0], i + r0); + break; + case 1: + C[i] += Rr(~R[0], i + r1); + break; + case 2: + C[i] &= Rr(~R[0], i + r2); 
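+            /* Note: this case can only clear bits, since it ANDs
+             * with a rotation of the complemented selector word. */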
+ break; + case 3: + C[i] ^= Rr(R[0], i + r3); + break; + } + tmp ^= C[i]; + } + + /* Compress */ + tmp = 0; + for (i = 0; i < HEFTY1_SPONGE_WORDS - 1; i++) + if (i % 2) + tmp ^= C[i]; + else + tmp += C[i]; + R[0] ^= tmp; +} + +static void Absorb(uint32_t *S, uint32_t X) +{ + uint32_t *R = S; + R[0] ^= X; + Mangle(S); +} + +static uint32_t Squeeze(uint32_t *S) +{ + uint32_t Y = S[0]; + Mangle(S); + return Y; +} + +/* Branch, compress and serialize function */ +static inline uint32_t Br(HEFTY1_CTX *ctx, uint32_t X) +{ + uint32_t R = Squeeze(ctx->sponge); + + uint8_t r0 = R >> 8; + uint8_t r1 = R & 0xff; + + uint32_t Y = 1 << (r0 % 32); + + switch (r1 % 4) + { + case 0: + /* Do nothing */ + break; + case 1: + return X & ~Y; + case 2: + return X | Y; + case 3: + return X ^ Y; + } + + return X; +} + +static void HashBlock(HEFTY1_CTX *ctx) +{ + uint32_t A, B, C, D, E, F, G, H; + uint32_t W[HEFTY1_BLOCK_BYTES]; + + assert(ctx); + + A = ctx->h[0]; + B = ctx->h[1]; + C = ctx->h[2]; + D = ctx->h[3]; + E = ctx->h[4]; + F = ctx->h[5]; + G = ctx->h[6]; + H = ctx->h[7]; + + int t = 0; + for (; t < 16; t++) { + W[t] = Reverse32(((uint32_t *)&ctx->block[0])[t]); /* To host byte order */ + Absorb(ctx->sponge, W[t] ^ K[t]); + } + + for (t = 0; t < 16; t++) { + Absorb(ctx->sponge, D ^ H); + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + for (t = 16; t < 64; t++) { + Absorb(ctx->sponge, H + D); + W[t] = sigma1(W[t - 2]) + W[t - 7] + sigma0(W[t - 15]) + W[t - 16]; + RoundFunc(ctx, A, B, C, D, E, F, G, H, W[t], K[t]); + } + + ctx->h[0] += A; + ctx->h[1] += B; + ctx->h[2] += C; + ctx->h[3] += D; + ctx->h[4] += E; + ctx->h[5] += F; + ctx->h[6] += G; + ctx->h[7] += H; + + A = 0; + B = 0; + C = 0; + D = 0; + E = 0; + F = 0; + G = 0; + H = 0; + + memset(W, 0, sizeof(W)); +} + +/* Public interface */ + +void HEFTY1_Init(HEFTY1_CTX *ctx) +{ + assert(ctx); + + memcpy(ctx->h, H, sizeof(ctx->h)); + memset(ctx->block, 0, sizeof(ctx->block)); + ctx->written = 0; + memset(ctx->sponge, 0, sizeof(ctx->sponge)); +} + +void HEFTY1_Update(HEFTY1_CTX *ctx, const void *buf, size_t len) +{ + assert(ctx); + + uint64_t read = 0; + while (len) { + uint64_t end = ctx->written % HEFTY1_BLOCK_BYTES; + uint64_t count = Min(len, HEFTY1_BLOCK_BYTES - end); + memcpy(&ctx->block[end], &((unsigned char *)buf)[read], count); + len -= count; + read += count; + ctx->written += count; + if (!(ctx->written % HEFTY1_BLOCK_BYTES)) + HashBlock(ctx); + } +} + +void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *ctx) +{ + assert(digest); + assert(ctx); + + /* Pad message (FIPS 180 Section 5.1.1) */ + uint64_t used = ctx->written % HEFTY1_BLOCK_BYTES; + ctx->block[used++] = 0x80; /* Append 1 to end of message */ + if (used > HEFTY1_BLOCK_BYTES - 8) { + /* We have already written into the last 64bits, so + * we must continue into the next block. 
*/ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - used); + HashBlock(ctx); + used = 0; /* Create a new block (below) */ + } + + /* All remaining bits to zero */ + memset(&ctx->block[used], 0, HEFTY1_BLOCK_BYTES - 8 - used); + + /* The last 64bits encode the length (in network byte order) */ + uint64_t *len = (uint64_t *)&ctx->block[HEFTY1_BLOCK_BYTES - 8]; + *len = Reverse64(ctx->written*8); + + HashBlock(ctx); + + /* Convert back to network byte order */ + int i = 0; + for (; i < HEFTY1_STATE_WORDS; i++) + ctx->h[i] = Reverse32(ctx->h[i]); + + memcpy(digest, ctx->h, sizeof(ctx->h)); + memset(ctx, 0, sizeof(HEFTY1_CTX)); +} + +unsigned char* HEFTY1(const unsigned char *buf, size_t len, unsigned char *digest) +{ + HEFTY1_CTX ctx; + static unsigned char m[HEFTY1_DIGEST_BYTES]; + + if (!digest) + digest = m; + + HEFTY1_Init(&ctx); + HEFTY1_Update(&ctx, buf, len); + HEFTY1_Final(digest, &ctx); + + return digest; +} \ No newline at end of file diff --git a/sha3/sph_hefty1.h b/sha3/sph_hefty1.h new file mode 100644 index 00000000..afcd274f --- /dev/null +++ b/sha3/sph_hefty1.h @@ -0,0 +1,66 @@ +/* + * HEFTY1 cryptographic hash function + * + * Copyright (c) 2014, dbcc14 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * The views and conclusions contained in the software and documentation are those + * of the authors and should not be interpreted as representing official policies, + * either expressed or implied, of the FreeBSD Project. 
+ */
+
+#ifndef __HEFTY1_H__
+#define __HEFTY1_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef WIN32
+#include <sys/types.h>	/* size_t on POSIX systems (include name assumed;
+			   the original was lost in transcription) */
+#endif
+
+#include <stdint.h>	/* uint8_t, uint32_t, uint64_t (include name assumed) */
+
+#define HEFTY1_DIGEST_BYTES 32
+#define HEFTY1_BLOCK_BYTES 64
+#define HEFTY1_STATE_WORDS 8
+#define HEFTY1_SPONGE_WORDS 4
+
+typedef struct HEFTY1_CTX {
+	uint32_t h[HEFTY1_STATE_WORDS];
+	uint8_t block[HEFTY1_BLOCK_BYTES];
+	uint64_t written;
+	uint32_t sponge[HEFTY1_SPONGE_WORDS];
+} HEFTY1_CTX;
+
+void HEFTY1_Init(HEFTY1_CTX *cxt);
+void HEFTY1_Update(HEFTY1_CTX *cxt, const void *data, size_t len);
+void HEFTY1_Final(unsigned char *digest, HEFTY1_CTX *cxt);
+unsigned char* HEFTY1(const unsigned char *data, size_t len, unsigned char *digest);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __HEFTY1_H__ */
\ No newline at end of file
diff --git a/sha3/sph_jh.c b/sha3/sph_jh.c
new file mode 100644
index 00000000..41487a53
--- /dev/null
+++ b/sha3/sph_jh.c
@@ -0,0 +1,1116 @@
+/* $Id: jh.c 255 2011-06-07 19:50:20Z tp $ */
+/*
+ * JH implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_jh.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_JH
+#define SPH_SMALL_FOOTPRINT_JH   1
+#endif
+
+#if !defined SPH_JH_64 && SPH_64_TRUE
+#define SPH_JH_64   1
+#endif
+
+#if !SPH_64
+#undef SPH_JH_64
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
+
+/*
+ * The internal bitslice representation may use either big-endian or
+ * little-endian (true bitslice operations do not care about the bit
+ * ordering, and the bit-swapping linear operations in JH happen to
+ * be invariant through endianness-swapping). The constants must be
+ * defined according to the chosen endianness; we use some
+ * byte-swapping macros for that.
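+ *
+ * For example, on a little-endian machine C32e(0x01234567) evaluates
+ * to 0x67452301, so the constant sits in memory as the byte sequence
+ * 01 23 45 67 either way; dec32e_aligned and enc32e are selected to
+ * match the same convention.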
+ */ + +#if SPH_LITTLE_ENDIAN + +#define C32e(x) ((SPH_C32(x) >> 24) \ + | ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \ + | ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \ + | ((SPH_C32(x) << 24) & SPH_C32(0xFF000000))) +#define dec32e_aligned sph_dec32le_aligned +#define enc32e sph_enc32le + +#if SPH_64 +#define C64e(x) ((SPH_C64(x) >> 56) \ + | ((SPH_C64(x) >> 40) & SPH_C64(0x000000000000FF00)) \ + | ((SPH_C64(x) >> 24) & SPH_C64(0x0000000000FF0000)) \ + | ((SPH_C64(x) >> 8) & SPH_C64(0x00000000FF000000)) \ + | ((SPH_C64(x) << 8) & SPH_C64(0x000000FF00000000)) \ + | ((SPH_C64(x) << 24) & SPH_C64(0x0000FF0000000000)) \ + | ((SPH_C64(x) << 40) & SPH_C64(0x00FF000000000000)) \ + | ((SPH_C64(x) << 56) & SPH_C64(0xFF00000000000000))) +#define dec64e_aligned sph_dec64le_aligned +#define enc64e sph_enc64le +#endif + +#else + +#define C32e(x) SPH_C32(x) +#define dec32e_aligned sph_dec32be_aligned +#define enc32e sph_enc32be +#if SPH_64 +#define C64e(x) SPH_C64(x) +#define dec64e_aligned sph_dec64be_aligned +#define enc64e sph_enc64be +#endif + +#endif + +#define Sb(x0, x1, x2, x3, c) do { \ + x3 = ~x3; \ + x0 ^= (c) & ~x2; \ + tmp = (c) ^ (x0 & x1); \ + x0 ^= x2 & x3; \ + x3 ^= ~x1 & x2; \ + x1 ^= x0 & x2; \ + x2 ^= x0 & ~x3; \ + x0 ^= x1 | x3; \ + x3 ^= x1 & x2; \ + x1 ^= tmp & x0; \ + x2 ^= tmp; \ + } while (0) + +#define Lb(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + x4 ^= x1; \ + x5 ^= x2; \ + x6 ^= x3 ^ x0; \ + x7 ^= x0; \ + x0 ^= x5; \ + x1 ^= x6; \ + x2 ^= x7 ^ x4; \ + x3 ^= x4; \ + } while (0) + +#if SPH_JH_64 + +static const sph_u64 C[] = { + C64e(0x72d5dea2df15f867), C64e(0x7b84150ab7231557), + C64e(0x81abd6904d5a87f6), C64e(0x4e9f4fc5c3d12b40), + C64e(0xea983ae05c45fa9c), C64e(0x03c5d29966b2999a), + C64e(0x660296b4f2bb538a), C64e(0xb556141a88dba231), + C64e(0x03a35a5c9a190edb), C64e(0x403fb20a87c14410), + C64e(0x1c051980849e951d), C64e(0x6f33ebad5ee7cddc), + C64e(0x10ba139202bf6b41), C64e(0xdc786515f7bb27d0), + C64e(0x0a2c813937aa7850), C64e(0x3f1abfd2410091d3), + C64e(0x422d5a0df6cc7e90), C64e(0xdd629f9c92c097ce), + C64e(0x185ca70bc72b44ac), C64e(0xd1df65d663c6fc23), + C64e(0x976e6c039ee0b81a), C64e(0x2105457e446ceca8), + C64e(0xeef103bb5d8e61fa), C64e(0xfd9697b294838197), + C64e(0x4a8e8537db03302f), C64e(0x2a678d2dfb9f6a95), + C64e(0x8afe7381f8b8696c), C64e(0x8ac77246c07f4214), + C64e(0xc5f4158fbdc75ec4), C64e(0x75446fa78f11bb80), + C64e(0x52de75b7aee488bc), C64e(0x82b8001e98a6a3f4), + C64e(0x8ef48f33a9a36315), C64e(0xaa5f5624d5b7f989), + C64e(0xb6f1ed207c5ae0fd), C64e(0x36cae95a06422c36), + C64e(0xce2935434efe983d), C64e(0x533af974739a4ba7), + C64e(0xd0f51f596f4e8186), C64e(0x0e9dad81afd85a9f), + C64e(0xa7050667ee34626a), C64e(0x8b0b28be6eb91727), + C64e(0x47740726c680103f), C64e(0xe0a07e6fc67e487b), + C64e(0x0d550aa54af8a4c0), C64e(0x91e3e79f978ef19e), + C64e(0x8676728150608dd4), C64e(0x7e9e5a41f3e5b062), + C64e(0xfc9f1fec4054207a), C64e(0xe3e41a00cef4c984), + C64e(0x4fd794f59dfa95d8), C64e(0x552e7e1124c354a5), + C64e(0x5bdf7228bdfe6e28), C64e(0x78f57fe20fa5c4b2), + C64e(0x05897cefee49d32e), C64e(0x447e9385eb28597f), + C64e(0x705f6937b324314a), C64e(0x5e8628f11dd6e465), + C64e(0xc71b770451b920e7), C64e(0x74fe43e823d4878a), + C64e(0x7d29e8a3927694f2), C64e(0xddcb7a099b30d9c1), + C64e(0x1d1b30fb5bdc1be0), C64e(0xda24494ff29c82bf), + C64e(0xa4e7ba31b470bfff), C64e(0x0d324405def8bc48), + C64e(0x3baefc3253bbd339), C64e(0x459fc3c1e0298ba0), + C64e(0xe5c905fdf7ae090f), C64e(0x947034124290f134), + C64e(0xa271b701e344ed95), C64e(0xe93b8e364f2f984a), + C64e(0x88401d63a06cf615), 
C64e(0x47c1444b8752afff), + C64e(0x7ebb4af1e20ac630), C64e(0x4670b6c5cc6e8ce6), + C64e(0xa4d5a456bd4fca00), C64e(0xda9d844bc83e18ae), + C64e(0x7357ce453064d1ad), C64e(0xe8a6ce68145c2567), + C64e(0xa3da8cf2cb0ee116), C64e(0x33e906589a94999a), + C64e(0x1f60b220c26f847b), C64e(0xd1ceac7fa0d18518), + C64e(0x32595ba18ddd19d3), C64e(0x509a1cc0aaa5b446), + C64e(0x9f3d6367e4046bba), C64e(0xf6ca19ab0b56ee7e), + C64e(0x1fb179eaa9282174), C64e(0xe9bdf7353b3651ee), + C64e(0x1d57ac5a7550d376), C64e(0x3a46c2fea37d7001), + C64e(0xf735c1af98a4d842), C64e(0x78edec209e6b6779), + C64e(0x41836315ea3adba8), C64e(0xfac33b4d32832c83), + C64e(0xa7403b1f1c2747f3), C64e(0x5940f034b72d769a), + C64e(0xe73e4e6cd2214ffd), C64e(0xb8fd8d39dc5759ef), + C64e(0x8d9b0c492b49ebda), C64e(0x5ba2d74968f3700d), + C64e(0x7d3baed07a8d5584), C64e(0xf5a5e9f0e4f88e65), + C64e(0xa0b8a2f436103b53), C64e(0x0ca8079e753eec5a), + C64e(0x9168949256e8884f), C64e(0x5bb05c55f8babc4c), + C64e(0xe3bb3b99f387947b), C64e(0x75daf4d6726b1c5d), + C64e(0x64aeac28dc34b36d), C64e(0x6c34a550b828db71), + C64e(0xf861e2f2108d512a), C64e(0xe3db643359dd75fc), + C64e(0x1cacbcf143ce3fa2), C64e(0x67bbd13c02e843b0), + C64e(0x330a5bca8829a175), C64e(0x7f34194db416535c), + C64e(0x923b94c30e794d1e), C64e(0x797475d7b6eeaf3f), + C64e(0xeaa8d4f7be1a3921), C64e(0x5cf47e094c232751), + C64e(0x26a32453ba323cd2), C64e(0x44a3174a6da6d5ad), + C64e(0xb51d3ea6aff2c908), C64e(0x83593d98916b3c56), + C64e(0x4cf87ca17286604d), C64e(0x46e23ecc086ec7f6), + C64e(0x2f9833b3b1bc765e), C64e(0x2bd666a5efc4e62a), + C64e(0x06f4b6e8bec1d436), C64e(0x74ee8215bcef2163), + C64e(0xfdc14e0df453c969), C64e(0xa77d5ac406585826), + C64e(0x7ec1141606e0fa16), C64e(0x7e90af3d28639d3f), + C64e(0xd2c9f2e3009bd20c), C64e(0x5faace30b7d40c30), + C64e(0x742a5116f2e03298), C64e(0x0deb30d8e3cef89a), + C64e(0x4bc59e7bb5f17992), C64e(0xff51e66e048668d3), + C64e(0x9b234d57e6966731), C64e(0xcce6a6f3170a7505), + C64e(0xb17681d913326cce), C64e(0x3c175284f805a262), + C64e(0xf42bcbb378471547), C64e(0xff46548223936a48), + C64e(0x38df58074e5e6565), C64e(0xf2fc7c89fc86508e), + C64e(0x31702e44d00bca86), C64e(0xf04009a23078474e), + C64e(0x65a0ee39d1f73883), C64e(0xf75ee937e42c3abd), + C64e(0x2197b2260113f86f), C64e(0xa344edd1ef9fdee7), + C64e(0x8ba0df15762592d9), C64e(0x3c85f7f612dc42be), + C64e(0xd8a7ec7cab27b07e), C64e(0x538d7ddaaa3ea8de), + C64e(0xaa25ce93bd0269d8), C64e(0x5af643fd1a7308f9), + C64e(0xc05fefda174a19a5), C64e(0x974d66334cfd216a), + C64e(0x35b49831db411570), C64e(0xea1e0fbbedcd549b), + C64e(0x9ad063a151974072), C64e(0xf6759dbf91476fe2) +}; + +#define Ceven_hi(r) (C[((r) << 2) + 0]) +#define Ceven_lo(r) (C[((r) << 2) + 1]) +#define Codd_hi(r) (C[((r) << 2) + 2]) +#define Codd_lo(r) (C[((r) << 2) + 3]) + +#define S(x0, x1, x2, x3, cb, r) do { \ + Sb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, cb ## hi(r)); \ + Sb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, cb ## lo(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## h, x1 ## h, x2 ## h, x3 ## h, \ + x4 ## h, x5 ## h, x6 ## h, x7 ## h); \ + Lb(x0 ## l, x1 ## l, x2 ## l, x3 ## l, \ + x4 ## l, x5 ## l, x6 ## l, x7 ## l); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u64 t = (x ## h & (c)) << (n); \ + x ## h = ((x ## h >> (n)) & (c)) | t; \ + t = (x ## l & (c)) << (n); \ + x ## l = ((x ## l >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C64(0x5555555555555555), 1) +#define W1(x) Wz(x, SPH_C64(0x3333333333333333), 2) +#define W2(x) Wz(x, SPH_C64(0x0F0F0F0F0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C64(0x00FF00FF00FF00FF), 8) 
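+/*
+ * Illustrative sketch (added note, not from the original source):
+ * Wz(x, c, n) swaps adjacent n-bit groups selected by mask c, in both
+ * 64-bit halves of a 128-bit slice. The same operation on a single
+ * word, for the W0 case (exchange each even-numbered bit with the odd
+ * bit above it), would look like this:
+ *
+ *   sph_u64 v = SPH_C64(0x0000000000000002);
+ *   sph_u64 t = (v & SPH_C64(0x5555555555555555)) << 1;
+ *   v = ((v >> 1) & SPH_C64(0x5555555555555555)) | t;
+ *   (v is now 0x0000000000000001: bits 0 and 1 have been exchanged)
+ */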
+#define W4(x) Wz(x, SPH_C64(0x0000FFFF0000FFFF), 16) +#define W5(x) Wz(x, SPH_C64(0x00000000FFFFFFFF), 32) +#define W6(x) do { \ + sph_u64 t = x ## h; \ + x ## h = x ## l; \ + x ## l = t; \ + } while (0) + +#define DECL_STATE \ + sph_u64 h0h, h1h, h2h, h3h, h4h, h5h, h6h, h7h; \ + sph_u64 h0l, h1l, h2l, h3l, h4l, h5l, h6l, h7l; \ + sph_u64 tmp; + +#define READ_STATE(state) do { \ + h0h = (state)->H.wide[ 0]; \ + h0l = (state)->H.wide[ 1]; \ + h1h = (state)->H.wide[ 2]; \ + h1l = (state)->H.wide[ 3]; \ + h2h = (state)->H.wide[ 4]; \ + h2l = (state)->H.wide[ 5]; \ + h3h = (state)->H.wide[ 6]; \ + h3l = (state)->H.wide[ 7]; \ + h4h = (state)->H.wide[ 8]; \ + h4l = (state)->H.wide[ 9]; \ + h5h = (state)->H.wide[10]; \ + h5l = (state)->H.wide[11]; \ + h6h = (state)->H.wide[12]; \ + h6l = (state)->H.wide[13]; \ + h7h = (state)->H.wide[14]; \ + h7l = (state)->H.wide[15]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.wide[ 0] = h0h; \ + (state)->H.wide[ 1] = h0l; \ + (state)->H.wide[ 2] = h1h; \ + (state)->H.wide[ 3] = h1l; \ + (state)->H.wide[ 4] = h2h; \ + (state)->H.wide[ 5] = h2l; \ + (state)->H.wide[ 6] = h3h; \ + (state)->H.wide[ 7] = h3l; \ + (state)->H.wide[ 8] = h4h; \ + (state)->H.wide[ 9] = h4l; \ + (state)->H.wide[10] = h5h; \ + (state)->H.wide[11] = h5l; \ + (state)->H.wide[12] = h6h; \ + (state)->H.wide[13] = h6l; \ + (state)->H.wide[14] = h7h; \ + (state)->H.wide[15] = h7l; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u64 m0h = dec64e_aligned(buf + 0); \ + sph_u64 m0l = dec64e_aligned(buf + 8); \ + sph_u64 m1h = dec64e_aligned(buf + 16); \ + sph_u64 m1l = dec64e_aligned(buf + 24); \ + sph_u64 m2h = dec64e_aligned(buf + 32); \ + sph_u64 m2l = dec64e_aligned(buf + 40); \ + sph_u64 m3h = dec64e_aligned(buf + 48); \ + sph_u64 m3l = dec64e_aligned(buf + 56); \ + h0h ^= m0h; \ + h0l ^= m0l; \ + h1h ^= m1h; \ + h1l ^= m1l; \ + h2h ^= m2h; \ + h2l ^= m2l; \ + h3h ^= m3h; \ + h3l ^= m3l; + +#define INPUT_BUF2 \ + h4h ^= m0h; \ + h4l ^= m0l; \ + h5h ^= m1h; \ + h5l ^= m1l; \ + h6h ^= m2h; \ + h6l ^= m2l; \ + h7h ^= m3h; \ + h7l ^= m3l; + +static const sph_u64 IV224[] = { + C64e(0x2dfedd62f99a98ac), C64e(0xae7cacd619d634e7), + C64e(0xa4831005bc301216), C64e(0xb86038c6c9661494), + C64e(0x66d9899f2580706f), C64e(0xce9ea31b1d9b1adc), + C64e(0x11e8325f7b366e10), C64e(0xf994857f02fa06c1), + C64e(0x1b4f1b5cd8c840b3), C64e(0x97f6a17f6e738099), + C64e(0xdcdf93a5adeaa3d3), C64e(0xa431e8dec9539a68), + C64e(0x22b4a98aec86a1e4), C64e(0xd574ac959ce56cf0), + C64e(0x15960deab5ab2bbf), C64e(0x9611dcf0dd64ea6e) +}; + +static const sph_u64 IV256[] = { + C64e(0xeb98a3412c20d3eb), C64e(0x92cdbe7b9cb245c1), + C64e(0x1c93519160d4c7fa), C64e(0x260082d67e508a03), + C64e(0xa4239e267726b945), C64e(0xe0fb1a48d41a9477), + C64e(0xcdb5ab26026b177a), C64e(0x56f024420fff2fa8), + C64e(0x71a396897f2e4d75), C64e(0x1d144908f77de262), + C64e(0x277695f776248f94), C64e(0x87d5b6574780296c), + C64e(0x5c5e272dac8e0d6c), C64e(0x518450c657057a0f), + C64e(0x7be4d367702412ea), C64e(0x89e3ab13d31cd769) +}; + +static const sph_u64 IV384[] = { + C64e(0x481e3bc6d813398a), C64e(0x6d3b5e894ade879b), + C64e(0x63faea68d480ad2e), C64e(0x332ccb21480f8267), + C64e(0x98aec84d9082b928), C64e(0xd455ea3041114249), + C64e(0x36f555b2924847ec), C64e(0xc7250a93baf43ce1), + C64e(0x569b7f8a27db454c), C64e(0x9efcbd496397af0e), + C64e(0x589fc27d26aa80cd), C64e(0x80c08b8c9deb2eda), + C64e(0x8a7981e8f8d5373a), C64e(0xf43967adddd17a71), + C64e(0xa9b4d3bda475d394), C64e(0x976c3fba9842737f) +}; + +static const sph_u64 IV512[] = { + 
C64e(0x6fd14b963e00aa17), C64e(0x636a2e057a15d543), + C64e(0x8a225e8d0c97ef0b), C64e(0xe9341259f2b3c361), + C64e(0x891da0c1536f801e), C64e(0x2aa9056bea2b6d80), + C64e(0x588eccdb2075baa6), C64e(0xa90f3a76baf83bf7), + C64e(0x0169e60541e34a69), C64e(0x46b58a8e2e6fe65a), + C64e(0x1047a7d0c1843c24), C64e(0x3b6e71b12d5ac199), + C64e(0xcf57f6ec9db1f856), C64e(0xa706887c5716b156), + C64e(0xe3c2fcdfe68517fb), C64e(0x545a4678cc8cdd4b) +}; + +#else + +static const sph_u32 C[] = { + C32e(0x72d5dea2), C32e(0xdf15f867), C32e(0x7b84150a), + C32e(0xb7231557), C32e(0x81abd690), C32e(0x4d5a87f6), + C32e(0x4e9f4fc5), C32e(0xc3d12b40), C32e(0xea983ae0), + C32e(0x5c45fa9c), C32e(0x03c5d299), C32e(0x66b2999a), + C32e(0x660296b4), C32e(0xf2bb538a), C32e(0xb556141a), + C32e(0x88dba231), C32e(0x03a35a5c), C32e(0x9a190edb), + C32e(0x403fb20a), C32e(0x87c14410), C32e(0x1c051980), + C32e(0x849e951d), C32e(0x6f33ebad), C32e(0x5ee7cddc), + C32e(0x10ba1392), C32e(0x02bf6b41), C32e(0xdc786515), + C32e(0xf7bb27d0), C32e(0x0a2c8139), C32e(0x37aa7850), + C32e(0x3f1abfd2), C32e(0x410091d3), C32e(0x422d5a0d), + C32e(0xf6cc7e90), C32e(0xdd629f9c), C32e(0x92c097ce), + C32e(0x185ca70b), C32e(0xc72b44ac), C32e(0xd1df65d6), + C32e(0x63c6fc23), C32e(0x976e6c03), C32e(0x9ee0b81a), + C32e(0x2105457e), C32e(0x446ceca8), C32e(0xeef103bb), + C32e(0x5d8e61fa), C32e(0xfd9697b2), C32e(0x94838197), + C32e(0x4a8e8537), C32e(0xdb03302f), C32e(0x2a678d2d), + C32e(0xfb9f6a95), C32e(0x8afe7381), C32e(0xf8b8696c), + C32e(0x8ac77246), C32e(0xc07f4214), C32e(0xc5f4158f), + C32e(0xbdc75ec4), C32e(0x75446fa7), C32e(0x8f11bb80), + C32e(0x52de75b7), C32e(0xaee488bc), C32e(0x82b8001e), + C32e(0x98a6a3f4), C32e(0x8ef48f33), C32e(0xa9a36315), + C32e(0xaa5f5624), C32e(0xd5b7f989), C32e(0xb6f1ed20), + C32e(0x7c5ae0fd), C32e(0x36cae95a), C32e(0x06422c36), + C32e(0xce293543), C32e(0x4efe983d), C32e(0x533af974), + C32e(0x739a4ba7), C32e(0xd0f51f59), C32e(0x6f4e8186), + C32e(0x0e9dad81), C32e(0xafd85a9f), C32e(0xa7050667), + C32e(0xee34626a), C32e(0x8b0b28be), C32e(0x6eb91727), + C32e(0x47740726), C32e(0xc680103f), C32e(0xe0a07e6f), + C32e(0xc67e487b), C32e(0x0d550aa5), C32e(0x4af8a4c0), + C32e(0x91e3e79f), C32e(0x978ef19e), C32e(0x86767281), + C32e(0x50608dd4), C32e(0x7e9e5a41), C32e(0xf3e5b062), + C32e(0xfc9f1fec), C32e(0x4054207a), C32e(0xe3e41a00), + C32e(0xcef4c984), C32e(0x4fd794f5), C32e(0x9dfa95d8), + C32e(0x552e7e11), C32e(0x24c354a5), C32e(0x5bdf7228), + C32e(0xbdfe6e28), C32e(0x78f57fe2), C32e(0x0fa5c4b2), + C32e(0x05897cef), C32e(0xee49d32e), C32e(0x447e9385), + C32e(0xeb28597f), C32e(0x705f6937), C32e(0xb324314a), + C32e(0x5e8628f1), C32e(0x1dd6e465), C32e(0xc71b7704), + C32e(0x51b920e7), C32e(0x74fe43e8), C32e(0x23d4878a), + C32e(0x7d29e8a3), C32e(0x927694f2), C32e(0xddcb7a09), + C32e(0x9b30d9c1), C32e(0x1d1b30fb), C32e(0x5bdc1be0), + C32e(0xda24494f), C32e(0xf29c82bf), C32e(0xa4e7ba31), + C32e(0xb470bfff), C32e(0x0d324405), C32e(0xdef8bc48), + C32e(0x3baefc32), C32e(0x53bbd339), C32e(0x459fc3c1), + C32e(0xe0298ba0), C32e(0xe5c905fd), C32e(0xf7ae090f), + C32e(0x94703412), C32e(0x4290f134), C32e(0xa271b701), + C32e(0xe344ed95), C32e(0xe93b8e36), C32e(0x4f2f984a), + C32e(0x88401d63), C32e(0xa06cf615), C32e(0x47c1444b), + C32e(0x8752afff), C32e(0x7ebb4af1), C32e(0xe20ac630), + C32e(0x4670b6c5), C32e(0xcc6e8ce6), C32e(0xa4d5a456), + C32e(0xbd4fca00), C32e(0xda9d844b), C32e(0xc83e18ae), + C32e(0x7357ce45), C32e(0x3064d1ad), C32e(0xe8a6ce68), + C32e(0x145c2567), C32e(0xa3da8cf2), C32e(0xcb0ee116), + C32e(0x33e90658), C32e(0x9a94999a), C32e(0x1f60b220), 
+ C32e(0xc26f847b), C32e(0xd1ceac7f), C32e(0xa0d18518), + C32e(0x32595ba1), C32e(0x8ddd19d3), C32e(0x509a1cc0), + C32e(0xaaa5b446), C32e(0x9f3d6367), C32e(0xe4046bba), + C32e(0xf6ca19ab), C32e(0x0b56ee7e), C32e(0x1fb179ea), + C32e(0xa9282174), C32e(0xe9bdf735), C32e(0x3b3651ee), + C32e(0x1d57ac5a), C32e(0x7550d376), C32e(0x3a46c2fe), + C32e(0xa37d7001), C32e(0xf735c1af), C32e(0x98a4d842), + C32e(0x78edec20), C32e(0x9e6b6779), C32e(0x41836315), + C32e(0xea3adba8), C32e(0xfac33b4d), C32e(0x32832c83), + C32e(0xa7403b1f), C32e(0x1c2747f3), C32e(0x5940f034), + C32e(0xb72d769a), C32e(0xe73e4e6c), C32e(0xd2214ffd), + C32e(0xb8fd8d39), C32e(0xdc5759ef), C32e(0x8d9b0c49), + C32e(0x2b49ebda), C32e(0x5ba2d749), C32e(0x68f3700d), + C32e(0x7d3baed0), C32e(0x7a8d5584), C32e(0xf5a5e9f0), + C32e(0xe4f88e65), C32e(0xa0b8a2f4), C32e(0x36103b53), + C32e(0x0ca8079e), C32e(0x753eec5a), C32e(0x91689492), + C32e(0x56e8884f), C32e(0x5bb05c55), C32e(0xf8babc4c), + C32e(0xe3bb3b99), C32e(0xf387947b), C32e(0x75daf4d6), + C32e(0x726b1c5d), C32e(0x64aeac28), C32e(0xdc34b36d), + C32e(0x6c34a550), C32e(0xb828db71), C32e(0xf861e2f2), + C32e(0x108d512a), C32e(0xe3db6433), C32e(0x59dd75fc), + C32e(0x1cacbcf1), C32e(0x43ce3fa2), C32e(0x67bbd13c), + C32e(0x02e843b0), C32e(0x330a5bca), C32e(0x8829a175), + C32e(0x7f34194d), C32e(0xb416535c), C32e(0x923b94c3), + C32e(0x0e794d1e), C32e(0x797475d7), C32e(0xb6eeaf3f), + C32e(0xeaa8d4f7), C32e(0xbe1a3921), C32e(0x5cf47e09), + C32e(0x4c232751), C32e(0x26a32453), C32e(0xba323cd2), + C32e(0x44a3174a), C32e(0x6da6d5ad), C32e(0xb51d3ea6), + C32e(0xaff2c908), C32e(0x83593d98), C32e(0x916b3c56), + C32e(0x4cf87ca1), C32e(0x7286604d), C32e(0x46e23ecc), + C32e(0x086ec7f6), C32e(0x2f9833b3), C32e(0xb1bc765e), + C32e(0x2bd666a5), C32e(0xefc4e62a), C32e(0x06f4b6e8), + C32e(0xbec1d436), C32e(0x74ee8215), C32e(0xbcef2163), + C32e(0xfdc14e0d), C32e(0xf453c969), C32e(0xa77d5ac4), + C32e(0x06585826), C32e(0x7ec11416), C32e(0x06e0fa16), + C32e(0x7e90af3d), C32e(0x28639d3f), C32e(0xd2c9f2e3), + C32e(0x009bd20c), C32e(0x5faace30), C32e(0xb7d40c30), + C32e(0x742a5116), C32e(0xf2e03298), C32e(0x0deb30d8), + C32e(0xe3cef89a), C32e(0x4bc59e7b), C32e(0xb5f17992), + C32e(0xff51e66e), C32e(0x048668d3), C32e(0x9b234d57), + C32e(0xe6966731), C32e(0xcce6a6f3), C32e(0x170a7505), + C32e(0xb17681d9), C32e(0x13326cce), C32e(0x3c175284), + C32e(0xf805a262), C32e(0xf42bcbb3), C32e(0x78471547), + C32e(0xff465482), C32e(0x23936a48), C32e(0x38df5807), + C32e(0x4e5e6565), C32e(0xf2fc7c89), C32e(0xfc86508e), + C32e(0x31702e44), C32e(0xd00bca86), C32e(0xf04009a2), + C32e(0x3078474e), C32e(0x65a0ee39), C32e(0xd1f73883), + C32e(0xf75ee937), C32e(0xe42c3abd), C32e(0x2197b226), + C32e(0x0113f86f), C32e(0xa344edd1), C32e(0xef9fdee7), + C32e(0x8ba0df15), C32e(0x762592d9), C32e(0x3c85f7f6), + C32e(0x12dc42be), C32e(0xd8a7ec7c), C32e(0xab27b07e), + C32e(0x538d7dda), C32e(0xaa3ea8de), C32e(0xaa25ce93), + C32e(0xbd0269d8), C32e(0x5af643fd), C32e(0x1a7308f9), + C32e(0xc05fefda), C32e(0x174a19a5), C32e(0x974d6633), + C32e(0x4cfd216a), C32e(0x35b49831), C32e(0xdb411570), + C32e(0xea1e0fbb), C32e(0xedcd549b), C32e(0x9ad063a1), + C32e(0x51974072), C32e(0xf6759dbf), C32e(0x91476fe2) +}; + +#define Ceven_w3(r) (C[((r) << 3) + 0]) +#define Ceven_w2(r) (C[((r) << 3) + 1]) +#define Ceven_w1(r) (C[((r) << 3) + 2]) +#define Ceven_w0(r) (C[((r) << 3) + 3]) +#define Codd_w3(r) (C[((r) << 3) + 4]) +#define Codd_w2(r) (C[((r) << 3) + 5]) +#define Codd_w1(r) (C[((r) << 3) + 6]) +#define Codd_w0(r) (C[((r) << 3) + 7]) + +#define S(x0, x1, x2, x3, cb, 
r) do { \ + Sb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, cb ## w3(r)); \ + Sb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, cb ## w2(r)); \ + Sb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, cb ## w1(r)); \ + Sb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, cb ## w0(r)); \ + } while (0) + +#define L(x0, x1, x2, x3, x4, x5, x6, x7) do { \ + Lb(x0 ## 3, x1 ## 3, x2 ## 3, x3 ## 3, \ + x4 ## 3, x5 ## 3, x6 ## 3, x7 ## 3); \ + Lb(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, \ + x4 ## 2, x5 ## 2, x6 ## 2, x7 ## 2); \ + Lb(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, \ + x4 ## 1, x5 ## 1, x6 ## 1, x7 ## 1); \ + Lb(x0 ## 0, x1 ## 0, x2 ## 0, x3 ## 0, \ + x4 ## 0, x5 ## 0, x6 ## 0, x7 ## 0); \ + } while (0) + +#define Wz(x, c, n) do { \ + sph_u32 t = (x ## 3 & (c)) << (n); \ + x ## 3 = ((x ## 3 >> (n)) & (c)) | t; \ + t = (x ## 2 & (c)) << (n); \ + x ## 2 = ((x ## 2 >> (n)) & (c)) | t; \ + t = (x ## 1 & (c)) << (n); \ + x ## 1 = ((x ## 1 >> (n)) & (c)) | t; \ + t = (x ## 0 & (c)) << (n); \ + x ## 0 = ((x ## 0 >> (n)) & (c)) | t; \ + } while (0) + +#define W0(x) Wz(x, SPH_C32(0x55555555), 1) +#define W1(x) Wz(x, SPH_C32(0x33333333), 2) +#define W2(x) Wz(x, SPH_C32(0x0F0F0F0F), 4) +#define W3(x) Wz(x, SPH_C32(0x00FF00FF), 8) +#define W4(x) Wz(x, SPH_C32(0x0000FFFF), 16) +#define W5(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 2; \ + x ## 2 = t; \ + t = x ## 1; \ + x ## 1 = x ## 0; \ + x ## 0 = t; \ + } while (0) +#define W6(x) do { \ + sph_u32 t = x ## 3; \ + x ## 3 = x ## 1; \ + x ## 1 = t; \ + t = x ## 2; \ + x ## 2 = x ## 0; \ + x ## 0 = t; \ + } while (0) + +#define DECL_STATE \ + sph_u32 h03, h02, h01, h00, h13, h12, h11, h10; \ + sph_u32 h23, h22, h21, h20, h33, h32, h31, h30; \ + sph_u32 h43, h42, h41, h40, h53, h52, h51, h50; \ + sph_u32 h63, h62, h61, h60, h73, h72, h71, h70; \ + sph_u32 tmp; + +#define READ_STATE(state) do { \ + h03 = (state)->H.narrow[ 0]; \ + h02 = (state)->H.narrow[ 1]; \ + h01 = (state)->H.narrow[ 2]; \ + h00 = (state)->H.narrow[ 3]; \ + h13 = (state)->H.narrow[ 4]; \ + h12 = (state)->H.narrow[ 5]; \ + h11 = (state)->H.narrow[ 6]; \ + h10 = (state)->H.narrow[ 7]; \ + h23 = (state)->H.narrow[ 8]; \ + h22 = (state)->H.narrow[ 9]; \ + h21 = (state)->H.narrow[10]; \ + h20 = (state)->H.narrow[11]; \ + h33 = (state)->H.narrow[12]; \ + h32 = (state)->H.narrow[13]; \ + h31 = (state)->H.narrow[14]; \ + h30 = (state)->H.narrow[15]; \ + h43 = (state)->H.narrow[16]; \ + h42 = (state)->H.narrow[17]; \ + h41 = (state)->H.narrow[18]; \ + h40 = (state)->H.narrow[19]; \ + h53 = (state)->H.narrow[20]; \ + h52 = (state)->H.narrow[21]; \ + h51 = (state)->H.narrow[22]; \ + h50 = (state)->H.narrow[23]; \ + h63 = (state)->H.narrow[24]; \ + h62 = (state)->H.narrow[25]; \ + h61 = (state)->H.narrow[26]; \ + h60 = (state)->H.narrow[27]; \ + h73 = (state)->H.narrow[28]; \ + h72 = (state)->H.narrow[29]; \ + h71 = (state)->H.narrow[30]; \ + h70 = (state)->H.narrow[31]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->H.narrow[ 0] = h03; \ + (state)->H.narrow[ 1] = h02; \ + (state)->H.narrow[ 2] = h01; \ + (state)->H.narrow[ 3] = h00; \ + (state)->H.narrow[ 4] = h13; \ + (state)->H.narrow[ 5] = h12; \ + (state)->H.narrow[ 6] = h11; \ + (state)->H.narrow[ 7] = h10; \ + (state)->H.narrow[ 8] = h23; \ + (state)->H.narrow[ 9] = h22; \ + (state)->H.narrow[10] = h21; \ + (state)->H.narrow[11] = h20; \ + (state)->H.narrow[12] = h33; \ + (state)->H.narrow[13] = h32; \ + (state)->H.narrow[14] = h31; \ + (state)->H.narrow[15] = h30; \ + (state)->H.narrow[16] = h43; \ + (state)->H.narrow[17] = h42; \ + (state)->H.narrow[18] = h41; \ + 
(state)->H.narrow[19] = h40; \ + (state)->H.narrow[20] = h53; \ + (state)->H.narrow[21] = h52; \ + (state)->H.narrow[22] = h51; \ + (state)->H.narrow[23] = h50; \ + (state)->H.narrow[24] = h63; \ + (state)->H.narrow[25] = h62; \ + (state)->H.narrow[26] = h61; \ + (state)->H.narrow[27] = h60; \ + (state)->H.narrow[28] = h73; \ + (state)->H.narrow[29] = h72; \ + (state)->H.narrow[30] = h71; \ + (state)->H.narrow[31] = h70; \ + } while (0) + +#define INPUT_BUF1 \ + sph_u32 m03 = dec32e_aligned(buf + 0); \ + sph_u32 m02 = dec32e_aligned(buf + 4); \ + sph_u32 m01 = dec32e_aligned(buf + 8); \ + sph_u32 m00 = dec32e_aligned(buf + 12); \ + sph_u32 m13 = dec32e_aligned(buf + 16); \ + sph_u32 m12 = dec32e_aligned(buf + 20); \ + sph_u32 m11 = dec32e_aligned(buf + 24); \ + sph_u32 m10 = dec32e_aligned(buf + 28); \ + sph_u32 m23 = dec32e_aligned(buf + 32); \ + sph_u32 m22 = dec32e_aligned(buf + 36); \ + sph_u32 m21 = dec32e_aligned(buf + 40); \ + sph_u32 m20 = dec32e_aligned(buf + 44); \ + sph_u32 m33 = dec32e_aligned(buf + 48); \ + sph_u32 m32 = dec32e_aligned(buf + 52); \ + sph_u32 m31 = dec32e_aligned(buf + 56); \ + sph_u32 m30 = dec32e_aligned(buf + 60); \ + h03 ^= m03; \ + h02 ^= m02; \ + h01 ^= m01; \ + h00 ^= m00; \ + h13 ^= m13; \ + h12 ^= m12; \ + h11 ^= m11; \ + h10 ^= m10; \ + h23 ^= m23; \ + h22 ^= m22; \ + h21 ^= m21; \ + h20 ^= m20; \ + h33 ^= m33; \ + h32 ^= m32; \ + h31 ^= m31; \ + h30 ^= m30; + +#define INPUT_BUF2 \ + h43 ^= m03; \ + h42 ^= m02; \ + h41 ^= m01; \ + h40 ^= m00; \ + h53 ^= m13; \ + h52 ^= m12; \ + h51 ^= m11; \ + h50 ^= m10; \ + h63 ^= m23; \ + h62 ^= m22; \ + h61 ^= m21; \ + h60 ^= m20; \ + h73 ^= m33; \ + h72 ^= m32; \ + h71 ^= m31; \ + h70 ^= m30; + +static const sph_u32 IV224[] = { + C32e(0x2dfedd62), C32e(0xf99a98ac), C32e(0xae7cacd6), C32e(0x19d634e7), + C32e(0xa4831005), C32e(0xbc301216), C32e(0xb86038c6), C32e(0xc9661494), + C32e(0x66d9899f), C32e(0x2580706f), C32e(0xce9ea31b), C32e(0x1d9b1adc), + C32e(0x11e8325f), C32e(0x7b366e10), C32e(0xf994857f), C32e(0x02fa06c1), + C32e(0x1b4f1b5c), C32e(0xd8c840b3), C32e(0x97f6a17f), C32e(0x6e738099), + C32e(0xdcdf93a5), C32e(0xadeaa3d3), C32e(0xa431e8de), C32e(0xc9539a68), + C32e(0x22b4a98a), C32e(0xec86a1e4), C32e(0xd574ac95), C32e(0x9ce56cf0), + C32e(0x15960dea), C32e(0xb5ab2bbf), C32e(0x9611dcf0), C32e(0xdd64ea6e) +}; + +static const sph_u32 IV256[] = { + C32e(0xeb98a341), C32e(0x2c20d3eb), C32e(0x92cdbe7b), C32e(0x9cb245c1), + C32e(0x1c935191), C32e(0x60d4c7fa), C32e(0x260082d6), C32e(0x7e508a03), + C32e(0xa4239e26), C32e(0x7726b945), C32e(0xe0fb1a48), C32e(0xd41a9477), + C32e(0xcdb5ab26), C32e(0x026b177a), C32e(0x56f02442), C32e(0x0fff2fa8), + C32e(0x71a39689), C32e(0x7f2e4d75), C32e(0x1d144908), C32e(0xf77de262), + C32e(0x277695f7), C32e(0x76248f94), C32e(0x87d5b657), C32e(0x4780296c), + C32e(0x5c5e272d), C32e(0xac8e0d6c), C32e(0x518450c6), C32e(0x57057a0f), + C32e(0x7be4d367), C32e(0x702412ea), C32e(0x89e3ab13), C32e(0xd31cd769) +}; + +static const sph_u32 IV384[] = { + C32e(0x481e3bc6), C32e(0xd813398a), C32e(0x6d3b5e89), C32e(0x4ade879b), + C32e(0x63faea68), C32e(0xd480ad2e), C32e(0x332ccb21), C32e(0x480f8267), + C32e(0x98aec84d), C32e(0x9082b928), C32e(0xd455ea30), C32e(0x41114249), + C32e(0x36f555b2), C32e(0x924847ec), C32e(0xc7250a93), C32e(0xbaf43ce1), + C32e(0x569b7f8a), C32e(0x27db454c), C32e(0x9efcbd49), C32e(0x6397af0e), + C32e(0x589fc27d), C32e(0x26aa80cd), C32e(0x80c08b8c), C32e(0x9deb2eda), + C32e(0x8a7981e8), C32e(0xf8d5373a), C32e(0xf43967ad), C32e(0xddd17a71), + C32e(0xa9b4d3bd), C32e(0xa475d394), 
C32e(0x976c3fba), C32e(0x9842737f) +}; + +static const sph_u32 IV512[] = { + C32e(0x6fd14b96), C32e(0x3e00aa17), C32e(0x636a2e05), C32e(0x7a15d543), + C32e(0x8a225e8d), C32e(0x0c97ef0b), C32e(0xe9341259), C32e(0xf2b3c361), + C32e(0x891da0c1), C32e(0x536f801e), C32e(0x2aa9056b), C32e(0xea2b6d80), + C32e(0x588eccdb), C32e(0x2075baa6), C32e(0xa90f3a76), C32e(0xbaf83bf7), + C32e(0x0169e605), C32e(0x41e34a69), C32e(0x46b58a8e), C32e(0x2e6fe65a), + C32e(0x1047a7d0), C32e(0xc1843c24), C32e(0x3b6e71b1), C32e(0x2d5ac199), + C32e(0xcf57f6ec), C32e(0x9db1f856), C32e(0xa706887c), C32e(0x5716b156), + C32e(0xe3c2fcdf), C32e(0xe68517fb), C32e(0x545a4678), C32e(0xcc8cdd4b) +}; + +#endif + +#define SL(ro) SLu(r + ro, ro) + +#define SLu(r, ro) do { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + W ## ro(h1); \ + W ## ro(h3); \ + W ## ro(h5); \ + W ## ro(h7); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_JH + +#if SPH_JH_64 + +/* + * The "small footprint" 64-bit version just uses a partially unrolled + * loop. + */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#else + +#define E8 do { \ + unsigned r, g; \ + for (r = g = 0; r < 42; r ++) { \ + S(h0, h2, h4, h6, Ceven_, r); \ + S(h1, h3, h5, h7, Codd_, r); \ + L(h0, h2, h4, h6, h1, h3, h5, h7); \ + switch (g) { \ + case 0: \ + W0(h1); \ + W0(h3); \ + W0(h5); \ + W0(h7); \ + break; \ + case 1: \ + W1(h1); \ + W1(h3); \ + W1(h5); \ + W1(h7); \ + break; \ + case 2: \ + W2(h1); \ + W2(h3); \ + W2(h5); \ + W2(h7); \ + break; \ + case 3: \ + W3(h1); \ + W3(h3); \ + W3(h5); \ + W3(h7); \ + break; \ + case 4: \ + W4(h1); \ + W4(h3); \ + W4(h5); \ + W4(h7); \ + break; \ + case 5: \ + W5(h1); \ + W5(h3); \ + W5(h5); \ + W5(h7); \ + break; \ + case 6: \ + W6(h1); \ + W6(h3); \ + W6(h5); \ + W6(h7); \ + break; \ + } \ + if (++ g == 7) \ + g = 0; \ + } \ + } while (0) + +#endif + +#else + +#if SPH_JH_64 + +/* + * On a "true 64-bit" architecture, we can unroll at will. + */ + +#define E8 do { \ + SLu( 0, 0); \ + SLu( 1, 1); \ + SLu( 2, 2); \ + SLu( 3, 3); \ + SLu( 4, 4); \ + SLu( 5, 5); \ + SLu( 6, 6); \ + SLu( 7, 0); \ + SLu( 8, 1); \ + SLu( 9, 2); \ + SLu(10, 3); \ + SLu(11, 4); \ + SLu(12, 5); \ + SLu(13, 6); \ + SLu(14, 0); \ + SLu(15, 1); \ + SLu(16, 2); \ + SLu(17, 3); \ + SLu(18, 4); \ + SLu(19, 5); \ + SLu(20, 6); \ + SLu(21, 0); \ + SLu(22, 1); \ + SLu(23, 2); \ + SLu(24, 3); \ + SLu(25, 4); \ + SLu(26, 5); \ + SLu(27, 6); \ + SLu(28, 0); \ + SLu(29, 1); \ + SLu(30, 2); \ + SLu(31, 3); \ + SLu(32, 4); \ + SLu(33, 5); \ + SLu(34, 6); \ + SLu(35, 0); \ + SLu(36, 1); \ + SLu(37, 2); \ + SLu(38, 3); \ + SLu(39, 4); \ + SLu(40, 5); \ + SLu(41, 6); \ + } while (0) + +#else + +/* + * We are not aiming at a small footprint, but we are still using a + * 32-bit implementation. Full loop unrolling would smash the L1 + * cache on some "big" architectures (32 kB L1 cache). 
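+ *
+ * Added note: the loop below runs the 42 rounds in six groups of
+ * seven because SL(ro) expands to SLu(r + ro, ro); the round-constant
+ * index r + ro advances from 0 to 41 while the W-permutation selector
+ * ro cycles through 0..6.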
+ */ + +#define E8 do { \ + unsigned r; \ + for (r = 0; r < 42; r += 7) { \ + SL(0); \ + SL(1); \ + SL(2); \ + SL(3); \ + SL(4); \ + SL(5); \ + SL(6); \ + } \ + } while (0) + +#endif + +#endif + +static void +jh_init(sph_jh_context *sc, const void *iv) +{ + sc->ptr = 0; +#if SPH_JH_64 + memcpy(sc->H.wide, iv, sizeof sc->H.wide); +#else + memcpy(sc->H.narrow, iv, sizeof sc->H.narrow); +#endif +#if SPH_64 + sc->block_count = 0; +#else + sc->block_count_high = 0; + sc->block_count_low = 0; +#endif +} + +static void +jh_core(sph_jh_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + INPUT_BUF1; + E8; + INPUT_BUF2; +#if SPH_64 + sc->block_count ++; +#else + if ((sc->block_count_low = SPH_T32( + sc->block_count_low + 1)) == 0) + sc->block_count_high ++; +#endif + ptr = 0; + } + } + WRITE_STATE(sc); + sc->ptr = ptr; +} + +static void +jh_close(sph_jh_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32, const void *iv) +{ + unsigned z; + unsigned char buf[128]; + size_t numz, u; +#if SPH_64 + sph_u64 l0, l1; +#else + sph_u32 l0, l1, l2, l3; +#endif + + z = 0x80 >> n; + buf[0] = ((ub & -z) | z) & 0xFF; + if (sc->ptr == 0 && n == 0) { + numz = 47; + } else { + numz = 111 - sc->ptr; + } + memset(buf + 1, 0, numz); +#if SPH_64 + l0 = SPH_T64(sc->block_count << 9) + (sc->ptr << 3) + n; + l1 = SPH_T64(sc->block_count >> 55); + sph_enc64be(buf + numz + 1, l1); + sph_enc64be(buf + numz + 9, l0); +#else + l0 = SPH_T32(sc->block_count_low << 9) + (sc->ptr << 3) + n; + l1 = SPH_T32(sc->block_count_low >> 23) + + SPH_T32(sc->block_count_high << 9); + l2 = SPH_T32(sc->block_count_high >> 23); + l3 = 0; + sph_enc32be(buf + numz + 1, l3); + sph_enc32be(buf + numz + 5, l2); + sph_enc32be(buf + numz + 9, l1); + sph_enc32be(buf + numz + 13, l0); +#endif + jh_core(sc, buf, numz + 17); +#if SPH_JH_64 + for (u = 0; u < 8; u ++) + enc64e(buf + (u << 3), sc->H.wide[u + 8]); +#else + for (u = 0; u < 16; u ++) + enc32e(buf + (u << 2), sc->H.narrow[u + 16]); +#endif + memcpy(dst, buf + ((16 - out_size_w32) << 2), out_size_w32 << 2); + jh_init(sc, iv); +} + +/* see sph_jh.h */ +void +sph_jh224_init(void *cc) +{ + jh_init(cc, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh224_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 7, IV224); +} + +/* see sph_jh.h */ +void +sph_jh256_init(void *cc) +{ + jh_init(cc, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256(void *cc, const void *data, size_t len) +{ + jh_core(cc, data, len); +} + +/* see sph_jh.h */ +void +sph_jh256_close(void *cc, void *dst) +{ + jh_close(cc, 0, 0, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + jh_close(cc, ub, n, dst, 8, IV256); +} + +/* see sph_jh.h */ +void +sph_jh384_init(void *cc) +{ + jh_init(cc, IV384); +} + +/* see sph_jh.h */ +void 
+sph_jh384(void *cc, const void *data, size_t len)
+{
+ jh_core(cc, data, len);
+}
+
+/* see sph_jh.h */
+void
+sph_jh384_close(void *cc, void *dst)
+{
+ jh_close(cc, 0, 0, dst, 12, IV384);
+}
+
+/* see sph_jh.h */
+void
+sph_jh384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ jh_close(cc, ub, n, dst, 12, IV384);
+}
+
+/* see sph_jh.h */
+void
+sph_jh512_init(void *cc)
+{
+ jh_init(cc, IV512);
+}
+
+/* see sph_jh.h */
+void
+sph_jh512(void *cc, const void *data, size_t len)
+{
+ jh_core(cc, data, len);
+}
+
+/* see sph_jh.h */
+void
+sph_jh512_close(void *cc, void *dst)
+{
+ jh_close(cc, 0, 0, dst, 16, IV512);
+}
+
+/* see sph_jh.h */
+void
+sph_jh512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ jh_close(cc, ub, n, dst, 16, IV512);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/sha3/sph_jh.h b/sha3/sph_jh.h
new file mode 100644
index 00000000..82fae58d
--- /dev/null
+++ b/sha3/sph_jh.h
@@ -0,0 +1,298 @@
+/* $Id: sph_jh.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * JH interface. JH is a family of functions which differ by
+ * their output size; this implementation defines JH for output
+ * sizes 224, 256, 384 and 512 bits.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_jh.h
+ * @author Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#ifndef SPH_JH_H__
+#define SPH_JH_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for JH-224.
+ */
+#define SPH_SIZE_jh224 224
+
+/**
+ * Output size (in bits) for JH-256.
+ */
+#define SPH_SIZE_jh256 256
+
+/**
+ * Output size (in bits) for JH-384.
+ */
+#define SPH_SIZE_jh384 384
+
+/**
+ * Output size (in bits) for JH-512.
+ */
+#define SPH_SIZE_jh512 512
+
+/**
+ * This structure is a context for JH computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a JH computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running JH computation
+ * can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
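+ *
+ * For example (illustrative sketch, not part of the original
+ * documentation), a running JH-256 computation can be forked so that
+ * two different suffixes are hashed over a common prefix:
+ *
+ *   sph_jh256_context c1, c2;
+ *   sph_jh256_init(&c1);
+ *   sph_jh256(&c1, "common prefix", 13);
+ *   memcpy(&c2, &c1, sizeof c1);   (c2 now evolves independently)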
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+ unsigned char buf[64]; /* first field, for alignment */
+ size_t ptr;
+ union {
+#if SPH_64
+ sph_u64 wide[16];
+#endif
+ sph_u32 narrow[32];
+ } H;
+#if SPH_64
+ sph_u64 block_count;
+#else
+ sph_u32 block_count_high, block_count_low;
+#endif
+#endif
+} sph_jh_context;
+
+/**
+ * Type for a JH-224 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh224_context;
+
+/**
+ * Type for a JH-256 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh256_context;
+
+/**
+ * Type for a JH-384 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh384_context;
+
+/**
+ * Type for a JH-512 context (identical to the common context).
+ */
+typedef sph_jh_context sph_jh512_context;
+
+/**
+ * Initialize a JH-224 context. This process performs no memory allocation.
+ *
+ * @param cc the JH-224 context (pointer to a
+ * sph_jh224_context)
+ */
+void sph_jh224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the JH-224 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_jh224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the JH-224 context
+ * @param dst the destination buffer
+ */
+void sph_jh224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the JH-224 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_jh224_addbits_and_close(
+ void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-256 context. This process performs no memory allocation.
+ *
+ * @param cc the JH-256 context (pointer to a
+ * sph_jh256_context)
+ */
+void sph_jh256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the JH-256 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_jh256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the JH-256 context
+ * @param dst the destination buffer
+ */
+void sph_jh256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the JH-256 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_jh256_addbits_and_close(
+ void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-384 context. This process performs no memory allocation.
+ *
+ * @param cc the JH-384 context (pointer to a
+ * sph_jh384_context)
+ */
+void sph_jh384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the JH-384 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_jh384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the JH-384 context
+ * @param dst the destination buffer
+ */
+void sph_jh384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the JH-384 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_jh384_addbits_and_close(
+ void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a JH-512 context. This process performs no memory allocation.
+ *
+ * @param cc the JH-512 context (pointer to a
+ * sph_jh512_context)
+ */
+void sph_jh512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the JH-512 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_jh512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current JH-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the JH-512 context
+ * @param dst the destination buffer
+ */
+void sph_jh512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 downto 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
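+ *
+ * For example (illustrative, not in the original documentation): to
+ * append a single '1' bit before closing, call
+ * sph_jh512_addbits_and_close(cc, 0x80, 1, dst); per the convention
+ * above, the bit taken is bit 7 of ub, i.e. its most significant bit.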
+ *
+ * @param cc the JH-512 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_jh512_addbits_and_close(
+ void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_keccak.c b/sha3/sph_keccak.c
new file mode 100644
index 00000000..cff9f87d
--- /dev/null
+++ b/sha3/sph_keccak.c
@@ -0,0 +1,1824 @@
+/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
+/*
+ * Keccak implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_keccak.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * Parameters:
+ *
+ * SPH_KECCAK_64 use a 64-bit type
+ * SPH_KECCAK_UNROLL number of loops to unroll (0/undef for full unroll)
+ * SPH_KECCAK_INTERLEAVE use bit-interleaving (32-bit type only)
+ * SPH_KECCAK_NOCOPY do not copy the state into local variables
+ *
+ * If there is no usable 64-bit type, the code automatically switches
+ * back to the 32-bit implementation.
+ *
+ * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1
+ * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core
+ * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302,
+ * 8 kB L1 code cache), seem to show that the following are optimal:
+ *
+ * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds,
+ * do not copy the state; unrolling 2, 6 or all rounds also provides
+ * near-optimal performance.
+ * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds,
+ * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds
+ * also provides near-optimal performance.
+ * -- PowerPC: use the 64-bit implementation, unroll 8 rounds,
+ * copy the state. Unrolling 4 or 6 rounds is near-optimal.
+ * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds,
+ * copy the state.
+ * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy
+ * the state. Unrolling only 1 round is also near-optimal.
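+ *
+ * Added note: each of these macros is defined below only when not
+ * already set, so the defaults can be overridden from the compiler
+ * command line; e.g., a sketch (assumed invocation, not from the
+ * original source) forcing the 64-bit code path with 8-round
+ * unrolling:
+ *
+ *   cc -DSPH_KECCAK_64=1 -DSPH_KECCAK_UNROLL=8 -c sph_keccak.c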
+ * + * Also, interleaving does not always yield actual improvements when + * using a 32-bit implementation; in particular when the architecture + * does not offer a native rotation opcode (interleaving replaces one + * 64-bit rotation with two 32-bit rotations, which is a gain only if + * there is a native 32-bit rotation opcode and not a native 64-bit + * rotation opcode; also, interleaving implies a small overhead when + * processing input words). + * + * To sum up: + * -- when possible, use the 64-bit code + * -- exception: on 32-bit x86, use 32-bit code + * -- when using 32-bit code, use interleaving + * -- copy the state, except on x86 + * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines + */ + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_SMALL_FOOTPRINT_KECCAK 1 +#endif + +/* + * By default, we select the 64-bit implementation if a 64-bit type + * is available, unless a 32-bit x86 is detected. + */ +#if !defined SPH_KECCAK_64 && SPH_64 \ + && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) +#define SPH_KECCAK_64 1 +#endif + +/* + * If using a 32-bit implementation, we prefer to interleave. + */ +#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE +#define SPH_KECCAK_INTERLEAVE 1 +#endif + +/* + * Unroll 8 rounds on big systems, 2 rounds on small systems. + */ +#ifndef SPH_KECCAK_UNROLL +#if SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_KECCAK_UNROLL 2 +#else +#define SPH_KECCAK_UNROLL 8 +#endif +#endif + +/* + * We do not want to copy the state to local variables on x86 (32-bit + * and 64-bit alike). + */ +#ifndef SPH_KECCAK_NOCOPY +#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC +#define SPH_KECCAK_NOCOPY 1 +#else +#define SPH_KECCAK_NOCOPY 0 +#endif +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_KECCAK_64 + +static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + +#if SPH_KECCAK_NOCOPY + +#define a00 (kc->u.wide[ 0]) +#define a10 (kc->u.wide[ 1]) +#define a20 (kc->u.wide[ 2]) +#define a30 (kc->u.wide[ 3]) +#define a40 (kc->u.wide[ 4]) +#define a01 (kc->u.wide[ 5]) +#define a11 (kc->u.wide[ 6]) +#define a21 (kc->u.wide[ 7]) +#define a31 (kc->u.wide[ 8]) +#define a41 (kc->u.wide[ 9]) +#define a02 (kc->u.wide[10]) +#define a12 (kc->u.wide[11]) +#define a22 (kc->u.wide[12]) +#define a32 (kc->u.wide[13]) +#define a42 (kc->u.wide[14]) +#define a03 (kc->u.wide[15]) +#define a13 (kc->u.wide[16]) +#define a23 (kc->u.wide[17]) +#define a33 (kc->u.wide[18]) +#define a43 (kc->u.wide[19]) +#define a04 (kc->u.wide[20]) +#define a14 (kc->u.wide[21]) +#define a24 (kc->u.wide[22]) +#define a34 (kc->u.wide[23]) +#define a44 (kc->u.wide[24]) + +#define DECL_STATE +#define READ_STATE(sc) +#define WRITE_STATE(sc) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + 
kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u64 a00, a01, a02, a03, a04; \ + sph_u64 a10, a11, a12, a13, a14; \ + sph_u64 a20, a21, a22, a23, a24; \ + sph_u64 a30, a31, a32, a33, a34; \ + sph_u64 a40, a41, a42, a43, a44; + +#define READ_STATE(state) do { \ + a00 = (state)->u.wide[ 0]; \ + a10 = (state)->u.wide[ 1]; \ + a20 = (state)->u.wide[ 2]; \ + a30 = (state)->u.wide[ 3]; \ + a40 = (state)->u.wide[ 4]; \ + a01 = (state)->u.wide[ 5]; \ + a11 = (state)->u.wide[ 6]; \ + a21 = (state)->u.wide[ 7]; \ + a31 = (state)->u.wide[ 8]; \ + a41 = (state)->u.wide[ 9]; \ + a02 = (state)->u.wide[10]; \ + a12 = (state)->u.wide[11]; \ + a22 = (state)->u.wide[12]; \ + a32 = (state)->u.wide[13]; \ + a42 = (state)->u.wide[14]; \ + a03 = (state)->u.wide[15]; \ + a13 = (state)->u.wide[16]; \ + a23 = (state)->u.wide[17]; \ + a33 = (state)->u.wide[18]; \ + a43 = (state)->u.wide[19]; \ + a04 = (state)->u.wide[20]; \ + a14 = (state)->u.wide[21]; \ + a24 = (state)->u.wide[22]; \ + a34 = (state)->u.wide[23]; \ + a44 = (state)->u.wide[24]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.wide[ 0] = a00; \ + (state)->u.wide[ 1] = a10; \ + (state)->u.wide[ 2] = a20; \ + (state)->u.wide[ 3] = a30; \ + (state)->u.wide[ 4] = a40; \ + (state)->u.wide[ 5] = a01; \ + (state)->u.wide[ 6] = a11; \ + (state)->u.wide[ 7] = a21; \ + (state)->u.wide[ 8] = a31; \ + (state)->u.wide[ 9] = a41; \ + (state)->u.wide[10] = a02; \ + (state)->u.wide[11] = a12; \ + (state)->u.wide[12] = a22; \ + (state)->u.wide[13] = a32; \ + (state)->u.wide[14] = a42; \ + (state)->u.wide[15] = a03; \ + (state)->u.wide[16] = a13; \ + (state)->u.wide[17] = a23; \ + (state)->u.wide[18] = a33; \ + (state)->u.wide[19] = a43; \ + (state)->u.wide[20] = a04; \ + (state)->u.wide[21] = a14; \ + (state)->u.wide[22] = a24; \ + (state)->u.wide[23] = a34; \ + (state)->u.wide[24] = a44; \ + } while (0) + +#define INPUT_BUF144 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ 
+ a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + if ((lim) == 72) \ + break; \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + if ((lim) == 104) \ + break; \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + if ((lim) == 136) \ + break; \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x +#define MOV64(d, s) (d = s) +#define XOR64(d, a, b) (d = a ^ b) +#define AND64(d, a, b) (d = a & b) +#define OR64(d, a, b) (d = a | b) +#define NOT64(d, s) (d = SPH_T64(~s)) +#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) +#define XOR64_IOTA XOR64 + +#else + +static const struct { + sph_u32 high, low; +} RC[] = { +#if SPH_KECCAK_INTERLEAVE + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000089), SPH_C32(0x00000000) }, + { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, + { SPH_C32(0x80008080), SPH_C32(0x00000000) }, + { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000082), SPH_C32(0x00000001) }, + { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, + { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, + { SPH_C32(0x00008082), SPH_C32(0x00000001) }, + { SPH_C32(0x00008003), SPH_C32(0x00000000) }, + { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000000) }, + { SPH_C32(0x80000008), SPH_C32(0x00000000) }, + { SPH_C32(0x00000083), SPH_C32(0x00000000) }, + { SPH_C32(0x80008003), SPH_C32(0x00000000) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000088), SPH_C32(0x00000000) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008082), SPH_C32(0x00000000) } +#else 
+ { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000000), SPH_C32(0x00008082) }, + { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008000) }, + { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008009) }, + { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, + { SPH_C32(0x00000000), SPH_C32(0x00000088) }, + { SPH_C32(0x00000000), SPH_C32(0x80008009) }, + { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, + { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, + { SPH_C32(0x80000000), SPH_C32(0x00008089) }, + { SPH_C32(0x80000000), SPH_C32(0x00008003) }, + { SPH_C32(0x80000000), SPH_C32(0x00008002) }, + { SPH_C32(0x80000000), SPH_C32(0x00000080) }, + { SPH_C32(0x00000000), SPH_C32(0x0000800A) }, + { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008080) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008008) } +#endif +}; + +#if SPH_KECCAK_INTERLEAVE + +#define INTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + (xl) = l; (xh) = h; \ + } while (0) + +#define UNINTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + (xl) = l; (xh) = h; \ + } while (0) + +#else + +#define INTERLEAVE(l, h) +#define UNINTERLEAVE(l, h) + +#endif + +#if SPH_KECCAK_NOCOPY + +#define a00l (kc->u.narrow[2 * 0 + 0]) +#define a00h (kc->u.narrow[2 * 0 + 1]) +#define a10l (kc->u.narrow[2 * 1 + 0]) +#define a10h (kc->u.narrow[2 * 1 + 1]) +#define a20l (kc->u.narrow[2 * 2 + 0]) +#define a20h (kc->u.narrow[2 * 2 + 1]) +#define a30l (kc->u.narrow[2 * 3 + 0]) +#define a30h (kc->u.narrow[2 * 3 + 1]) +#define a40l (kc->u.narrow[2 * 4 + 0]) +#define a40h (kc->u.narrow[2 * 4 + 1]) +#define a01l (kc->u.narrow[2 * 5 + 0]) +#define a01h (kc->u.narrow[2 * 5 + 1]) +#define a11l (kc->u.narrow[2 * 6 + 0]) +#define a11h (kc->u.narrow[2 * 6 + 1]) +#define a21l (kc->u.narrow[2 * 7 + 0]) +#define a21h (kc->u.narrow[2 * 7 + 1]) +#define a31l (kc->u.narrow[2 * 8 + 0]) +#define a31h (kc->u.narrow[2 * 8 + 1]) +#define a41l (kc->u.narrow[2 * 9 + 0]) +#define a41h (kc->u.narrow[2 * 9 + 1]) +#define a02l 
(kc->u.narrow[2 * 10 + 0]) +#define a02h (kc->u.narrow[2 * 10 + 1]) +#define a12l (kc->u.narrow[2 * 11 + 0]) +#define a12h (kc->u.narrow[2 * 11 + 1]) +#define a22l (kc->u.narrow[2 * 12 + 0]) +#define a22h (kc->u.narrow[2 * 12 + 1]) +#define a32l (kc->u.narrow[2 * 13 + 0]) +#define a32h (kc->u.narrow[2 * 13 + 1]) +#define a42l (kc->u.narrow[2 * 14 + 0]) +#define a42h (kc->u.narrow[2 * 14 + 1]) +#define a03l (kc->u.narrow[2 * 15 + 0]) +#define a03h (kc->u.narrow[2 * 15 + 1]) +#define a13l (kc->u.narrow[2 * 16 + 0]) +#define a13h (kc->u.narrow[2 * 16 + 1]) +#define a23l (kc->u.narrow[2 * 17 + 0]) +#define a23h (kc->u.narrow[2 * 17 + 1]) +#define a33l (kc->u.narrow[2 * 18 + 0]) +#define a33h (kc->u.narrow[2 * 18 + 1]) +#define a43l (kc->u.narrow[2 * 19 + 0]) +#define a43h (kc->u.narrow[2 * 19 + 1]) +#define a04l (kc->u.narrow[2 * 20 + 0]) +#define a04h (kc->u.narrow[2 * 20 + 1]) +#define a14l (kc->u.narrow[2 * 21 + 0]) +#define a14h (kc->u.narrow[2 * 21 + 1]) +#define a24l (kc->u.narrow[2 * 22 + 0]) +#define a24h (kc->u.narrow[2 * 22 + 1]) +#define a34l (kc->u.narrow[2 * 23 + 0]) +#define a34h (kc->u.narrow[2 * 23 + 1]) +#define a44l (kc->u.narrow[2 * 24 + 0]) +#define a44h (kc->u.narrow[2 * 24 + 1]) + +#define DECL_STATE +#define READ_STATE(state) +#define WRITE_STATE(state) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + j + 0); \ + th = sph_dec32le_aligned(buf + j + 4); \ + INTERLEAVE(tl, th); \ + kc->u.narrow[(j >> 2) + 0] ^= tl; \ + kc->u.narrow[(j >> 2) + 1] ^= th; \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ + sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ + sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ + sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ + sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; + +#define READ_STATE(state) do { \ + a00l = (state)->u.narrow[2 * 0 + 0]; \ + a00h = (state)->u.narrow[2 * 0 + 1]; \ + a10l = (state)->u.narrow[2 * 1 + 0]; \ + a10h = (state)->u.narrow[2 * 1 + 1]; \ + a20l = (state)->u.narrow[2 * 2 + 0]; \ + a20h = (state)->u.narrow[2 * 2 + 1]; \ + a30l = (state)->u.narrow[2 * 3 + 0]; \ + a30h = (state)->u.narrow[2 * 3 + 1]; \ + a40l = (state)->u.narrow[2 * 4 + 0]; \ + a40h = (state)->u.narrow[2 * 4 + 1]; \ + a01l = (state)->u.narrow[2 * 5 + 0]; \ + a01h = (state)->u.narrow[2 * 5 + 1]; \ + a11l = (state)->u.narrow[2 * 6 + 0]; \ + a11h = (state)->u.narrow[2 * 6 + 1]; \ + a21l = (state)->u.narrow[2 * 7 + 0]; \ + a21h = (state)->u.narrow[2 * 7 + 1]; \ + a31l = (state)->u.narrow[2 * 8 + 0]; \ + a31h = (state)->u.narrow[2 * 8 + 1]; \ + a41l = (state)->u.narrow[2 * 9 + 0]; \ + a41h = (state)->u.narrow[2 * 9 + 1]; \ + a02l = (state)->u.narrow[2 * 10 + 0]; \ + a02h = (state)->u.narrow[2 * 10 + 1]; \ + a12l = (state)->u.narrow[2 * 11 + 0]; \ + a12h = (state)->u.narrow[2 * 11 + 1]; \ + a22l = (state)->u.narrow[2 * 12 + 0]; \ + a22h = (state)->u.narrow[2 * 12 + 1]; \ + a32l = (state)->u.narrow[2 * 13 + 0]; \ + a32h = (state)->u.narrow[2 * 13 + 1]; \ + a42l = (state)->u.narrow[2 * 14 + 0]; \ + a42h = (state)->u.narrow[2 * 14 + 1]; \ + a03l = (state)->u.narrow[2 * 15 + 0]; \ + a03h = (state)->u.narrow[2 * 15 + 1]; \ + a13l = (state)->u.narrow[2 * 16 + 0]; \ + a13h = 
(state)->u.narrow[2 * 16 + 1]; \ + a23l = (state)->u.narrow[2 * 17 + 0]; \ + a23h = (state)->u.narrow[2 * 17 + 1]; \ + a33l = (state)->u.narrow[2 * 18 + 0]; \ + a33h = (state)->u.narrow[2 * 18 + 1]; \ + a43l = (state)->u.narrow[2 * 19 + 0]; \ + a43h = (state)->u.narrow[2 * 19 + 1]; \ + a04l = (state)->u.narrow[2 * 20 + 0]; \ + a04h = (state)->u.narrow[2 * 20 + 1]; \ + a14l = (state)->u.narrow[2 * 21 + 0]; \ + a14h = (state)->u.narrow[2 * 21 + 1]; \ + a24l = (state)->u.narrow[2 * 22 + 0]; \ + a24h = (state)->u.narrow[2 * 22 + 1]; \ + a34l = (state)->u.narrow[2 * 23 + 0]; \ + a34h = (state)->u.narrow[2 * 23 + 1]; \ + a44l = (state)->u.narrow[2 * 24 + 0]; \ + a44h = (state)->u.narrow[2 * 24 + 1]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.narrow[2 * 0 + 0] = a00l; \ + (state)->u.narrow[2 * 0 + 1] = a00h; \ + (state)->u.narrow[2 * 1 + 0] = a10l; \ + (state)->u.narrow[2 * 1 + 1] = a10h; \ + (state)->u.narrow[2 * 2 + 0] = a20l; \ + (state)->u.narrow[2 * 2 + 1] = a20h; \ + (state)->u.narrow[2 * 3 + 0] = a30l; \ + (state)->u.narrow[2 * 3 + 1] = a30h; \ + (state)->u.narrow[2 * 4 + 0] = a40l; \ + (state)->u.narrow[2 * 4 + 1] = a40h; \ + (state)->u.narrow[2 * 5 + 0] = a01l; \ + (state)->u.narrow[2 * 5 + 1] = a01h; \ + (state)->u.narrow[2 * 6 + 0] = a11l; \ + (state)->u.narrow[2 * 6 + 1] = a11h; \ + (state)->u.narrow[2 * 7 + 0] = a21l; \ + (state)->u.narrow[2 * 7 + 1] = a21h; \ + (state)->u.narrow[2 * 8 + 0] = a31l; \ + (state)->u.narrow[2 * 8 + 1] = a31h; \ + (state)->u.narrow[2 * 9 + 0] = a41l; \ + (state)->u.narrow[2 * 9 + 1] = a41h; \ + (state)->u.narrow[2 * 10 + 0] = a02l; \ + (state)->u.narrow[2 * 10 + 1] = a02h; \ + (state)->u.narrow[2 * 11 + 0] = a12l; \ + (state)->u.narrow[2 * 11 + 1] = a12h; \ + (state)->u.narrow[2 * 12 + 0] = a22l; \ + (state)->u.narrow[2 * 12 + 1] = a22h; \ + (state)->u.narrow[2 * 13 + 0] = a32l; \ + (state)->u.narrow[2 * 13 + 1] = a32h; \ + (state)->u.narrow[2 * 14 + 0] = a42l; \ + (state)->u.narrow[2 * 14 + 1] = a42h; \ + (state)->u.narrow[2 * 15 + 0] = a03l; \ + (state)->u.narrow[2 * 15 + 1] = a03h; \ + (state)->u.narrow[2 * 16 + 0] = a13l; \ + (state)->u.narrow[2 * 16 + 1] = a13h; \ + (state)->u.narrow[2 * 17 + 0] = a23l; \ + (state)->u.narrow[2 * 17 + 1] = a23h; \ + (state)->u.narrow[2 * 18 + 0] = a33l; \ + (state)->u.narrow[2 * 18 + 1] = a33h; \ + (state)->u.narrow[2 * 19 + 0] = a43l; \ + (state)->u.narrow[2 * 19 + 1] = a43h; \ + (state)->u.narrow[2 * 20 + 0] = a04l; \ + (state)->u.narrow[2 * 20 + 1] = a04h; \ + (state)->u.narrow[2 * 21 + 0] = a14l; \ + (state)->u.narrow[2 * 21 + 1] = a14h; \ + (state)->u.narrow[2 * 22 + 0] = a24l; \ + (state)->u.narrow[2 * 22 + 1] = a24h; \ + (state)->u.narrow[2 * 23 + 0] = a34l; \ + (state)->u.narrow[2 * 23 + 1] = a34h; \ + (state)->u.narrow[2 * 24 + 0] = a44l; \ + (state)->u.narrow[2 * 24 + 1] = a44h; \ + } while (0) + +#define READ64(d, off) do { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + (off)); \ + th = sph_dec32le_aligned(buf + (off) + 4); \ + INTERLEAVE(tl, th); \ + d ## l ^= tl; \ + d ## h ^= th; \ + } while (0) + +#define INPUT_BUF144 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + READ64(a23, 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + READ64(a00, 0); \ + 
READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + if ((lim) == 72) \ + break; \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + if ((lim) == 104) \ + break; \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + if ((lim) == 136) \ + break; \ + READ64(a23, 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x ## l, x ## h +#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) +#define XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) +#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) +#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) +#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) +#define ROL64(d, v, n) ROL64_ ## n(d, v) + +#if SPH_KECCAK_INTERLEAVE + +#define ROL64_odd1(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd63(d, v) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ + d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_even(d, v, n) do { \ + d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + } while (0) + +#define ROL64_0(d, v) +#define ROL64_1(d, v) ROL64_odd1(d, v) +#define ROL64_2(d, v) ROL64_even(d, v, 1) +#define ROL64_3(d, v) ROL64_odd( d, v, 2) +#define ROL64_4(d, v) ROL64_even(d, v, 2) +#define ROL64_5(d, v) ROL64_odd( d, v, 3) +#define ROL64_6(d, v) ROL64_even(d, v, 3) +#define ROL64_7(d, v) ROL64_odd( d, v, 4) +#define ROL64_8(d, v) ROL64_even(d, v, 4) +#define ROL64_9(d, v) ROL64_odd( d, v, 5) +#define ROL64_10(d, v) ROL64_even(d, v, 5) +#define ROL64_11(d, v) ROL64_odd( d, v, 6) +#define ROL64_12(d, v) ROL64_even(d, v, 6) +#define ROL64_13(d, v) ROL64_odd( d, v, 7) +#define ROL64_14(d, v) ROL64_even(d, v, 7) +#define ROL64_15(d, v) ROL64_odd( d, v, 8) +#define ROL64_16(d, v) ROL64_even(d, v, 8) +#define ROL64_17(d, v) ROL64_odd( d, v, 9) +#define ROL64_18(d, v) ROL64_even(d, v, 9) +#define ROL64_19(d, v) ROL64_odd( d, v, 10) +#define ROL64_20(d, v) ROL64_even(d, v, 10) +#define ROL64_21(d, v) ROL64_odd( d, v, 11) +#define ROL64_22(d, 
v) ROL64_even(d, v, 11) +#define ROL64_23(d, v) ROL64_odd( d, v, 12) +#define ROL64_24(d, v) ROL64_even(d, v, 12) +#define ROL64_25(d, v) ROL64_odd( d, v, 13) +#define ROL64_26(d, v) ROL64_even(d, v, 13) +#define ROL64_27(d, v) ROL64_odd( d, v, 14) +#define ROL64_28(d, v) ROL64_even(d, v, 14) +#define ROL64_29(d, v) ROL64_odd( d, v, 15) +#define ROL64_30(d, v) ROL64_even(d, v, 15) +#define ROL64_31(d, v) ROL64_odd( d, v, 16) +#define ROL64_32(d, v) ROL64_even(d, v, 16) +#define ROL64_33(d, v) ROL64_odd( d, v, 17) +#define ROL64_34(d, v) ROL64_even(d, v, 17) +#define ROL64_35(d, v) ROL64_odd( d, v, 18) +#define ROL64_36(d, v) ROL64_even(d, v, 18) +#define ROL64_37(d, v) ROL64_odd( d, v, 19) +#define ROL64_38(d, v) ROL64_even(d, v, 19) +#define ROL64_39(d, v) ROL64_odd( d, v, 20) +#define ROL64_40(d, v) ROL64_even(d, v, 20) +#define ROL64_41(d, v) ROL64_odd( d, v, 21) +#define ROL64_42(d, v) ROL64_even(d, v, 21) +#define ROL64_43(d, v) ROL64_odd( d, v, 22) +#define ROL64_44(d, v) ROL64_even(d, v, 22) +#define ROL64_45(d, v) ROL64_odd( d, v, 23) +#define ROL64_46(d, v) ROL64_even(d, v, 23) +#define ROL64_47(d, v) ROL64_odd( d, v, 24) +#define ROL64_48(d, v) ROL64_even(d, v, 24) +#define ROL64_49(d, v) ROL64_odd( d, v, 25) +#define ROL64_50(d, v) ROL64_even(d, v, 25) +#define ROL64_51(d, v) ROL64_odd( d, v, 26) +#define ROL64_52(d, v) ROL64_even(d, v, 26) +#define ROL64_53(d, v) ROL64_odd( d, v, 27) +#define ROL64_54(d, v) ROL64_even(d, v, 27) +#define ROL64_55(d, v) ROL64_odd( d, v, 28) +#define ROL64_56(d, v) ROL64_even(d, v, 28) +#define ROL64_57(d, v) ROL64_odd( d, v, 29) +#define ROL64_58(d, v) ROL64_even(d, v, 29) +#define ROL64_59(d, v) ROL64_odd( d, v, 30) +#define ROL64_60(d, v) ROL64_even(d, v, 30) +#define ROL64_61(d, v) ROL64_odd( d, v, 31) +#define ROL64_62(d, v) ROL64_even(d, v, 31) +#define ROL64_63(d, v) ROL64_odd63(d, v) + +#else + +#define ROL64_small(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ + d ## l = tmp; \ + } while (0) + +#define ROL64_0(d, v) 0 +#define ROL64_1(d, v) ROL64_small(d, v, 1) +#define ROL64_2(d, v) ROL64_small(d, v, 2) +#define ROL64_3(d, v) ROL64_small(d, v, 3) +#define ROL64_4(d, v) ROL64_small(d, v, 4) +#define ROL64_5(d, v) ROL64_small(d, v, 5) +#define ROL64_6(d, v) ROL64_small(d, v, 6) +#define ROL64_7(d, v) ROL64_small(d, v, 7) +#define ROL64_8(d, v) ROL64_small(d, v, 8) +#define ROL64_9(d, v) ROL64_small(d, v, 9) +#define ROL64_10(d, v) ROL64_small(d, v, 10) +#define ROL64_11(d, v) ROL64_small(d, v, 11) +#define ROL64_12(d, v) ROL64_small(d, v, 12) +#define ROL64_13(d, v) ROL64_small(d, v, 13) +#define ROL64_14(d, v) ROL64_small(d, v, 14) +#define ROL64_15(d, v) ROL64_small(d, v, 15) +#define ROL64_16(d, v) ROL64_small(d, v, 16) +#define ROL64_17(d, v) ROL64_small(d, v, 17) +#define ROL64_18(d, v) ROL64_small(d, v, 18) +#define ROL64_19(d, v) ROL64_small(d, v, 19) +#define ROL64_20(d, v) ROL64_small(d, v, 20) +#define ROL64_21(d, v) ROL64_small(d, v, 21) +#define ROL64_22(d, v) ROL64_small(d, v, 22) +#define ROL64_23(d, v) ROL64_small(d, v, 23) +#define ROL64_24(d, v) ROL64_small(d, v, 24) +#define ROL64_25(d, v) ROL64_small(d, v, 25) +#define ROL64_26(d, v) ROL64_small(d, v, 26) +#define ROL64_27(d, v) ROL64_small(d, v, 27) +#define ROL64_28(d, v) ROL64_small(d, v, 28) +#define ROL64_29(d, v) ROL64_small(d, v, 29) +#define ROL64_30(d, v) ROL64_small(d, v, 30) +#define ROL64_31(d, v) ROL64_small(d, v, 31) + +#define ROL64_32(d, v) do { \ + sph_u32 
tmp; \ + tmp = v ## l; \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_big(d, v, n) do { \ + sph_u32 trl, trh; \ + ROL64_small(tr, v, n); \ + d ## h = trl; \ + d ## l = trh; \ + } while (0) + +#define ROL64_33(d, v) ROL64_big(d, v, 1) +#define ROL64_34(d, v) ROL64_big(d, v, 2) +#define ROL64_35(d, v) ROL64_big(d, v, 3) +#define ROL64_36(d, v) ROL64_big(d, v, 4) +#define ROL64_37(d, v) ROL64_big(d, v, 5) +#define ROL64_38(d, v) ROL64_big(d, v, 6) +#define ROL64_39(d, v) ROL64_big(d, v, 7) +#define ROL64_40(d, v) ROL64_big(d, v, 8) +#define ROL64_41(d, v) ROL64_big(d, v, 9) +#define ROL64_42(d, v) ROL64_big(d, v, 10) +#define ROL64_43(d, v) ROL64_big(d, v, 11) +#define ROL64_44(d, v) ROL64_big(d, v, 12) +#define ROL64_45(d, v) ROL64_big(d, v, 13) +#define ROL64_46(d, v) ROL64_big(d, v, 14) +#define ROL64_47(d, v) ROL64_big(d, v, 15) +#define ROL64_48(d, v) ROL64_big(d, v, 16) +#define ROL64_49(d, v) ROL64_big(d, v, 17) +#define ROL64_50(d, v) ROL64_big(d, v, 18) +#define ROL64_51(d, v) ROL64_big(d, v, 19) +#define ROL64_52(d, v) ROL64_big(d, v, 20) +#define ROL64_53(d, v) ROL64_big(d, v, 21) +#define ROL64_54(d, v) ROL64_big(d, v, 22) +#define ROL64_55(d, v) ROL64_big(d, v, 23) +#define ROL64_56(d, v) ROL64_big(d, v, 24) +#define ROL64_57(d, v) ROL64_big(d, v, 25) +#define ROL64_58(d, v) ROL64_big(d, v, 26) +#define ROL64_59(d, v) ROL64_big(d, v, 27) +#define ROL64_60(d, v) ROL64_big(d, v, 28) +#define ROL64_61(d, v) ROL64_big(d, v, 29) +#define ROL64_62(d, v) ROL64_big(d, v, 30) +#define ROL64_63(d, v) ROL64_big(d, v, 31) + +#endif + +#define XOR64_IOTA(d, s, k) \ + (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) + +#endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + 
ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. + */ + +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, 
a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#define P1_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a30); \ + MOV64(a30, a33); \ + MOV64(a33, a23); \ + MOV64(a23, a12); \ + MOV64(a12, a21); \ + MOV64(a21, a02); \ + MOV64(a02, a10); \ + MOV64(a10, a11); \ + MOV64(a11, a41); \ + MOV64(a41, a24); \ + MOV64(a24, a42); \ + MOV64(a42, a04); \ + MOV64(a04, a20); \ + MOV64(a20, a22); \ + MOV64(a22, a32); \ + MOV64(a32, a43); \ + MOV64(a43, a34); \ + MOV64(a34, a03); \ + MOV64(a03, a40); \ + MOV64(a40, a44); \ + MOV64(a44, a14); \ + MOV64(a14, a31); \ + MOV64(a31, a13); \ + MOV64(a13, t); \ + } while (0) + +#define P2_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a33); \ + MOV64(a33, a12); \ + MOV64(a12, a02); \ + MOV64(a02, a11); \ + MOV64(a11, a24); \ + MOV64(a24, a04); \ + MOV64(a04, a22); \ + MOV64(a22, a43); \ + MOV64(a43, a03); \ + MOV64(a03, a44); \ + MOV64(a44, a31); \ + MOV64(a31, t); \ + MOV64(t, a10); \ + MOV64(a10, 
a41); \ + MOV64(a41, a42); \ + MOV64(a42, a20); \ + MOV64(a20, a32); \ + MOV64(a32, a34); \ + MOV64(a34, a40); \ + MOV64(a40, a14); \ + MOV64(a14, a13); \ + MOV64(a13, a30); \ + MOV64(a30, a23); \ + MOV64(a23, a21); \ + MOV64(a21, t); \ + } while (0) + +#define P4_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a12); \ + MOV64(a12, a11); \ + MOV64(a11, a04); \ + MOV64(a04, a43); \ + MOV64(a43, a44); \ + MOV64(a44, t); \ + MOV64(t, a02); \ + MOV64(a02, a24); \ + MOV64(a24, a22); \ + MOV64(a22, a03); \ + MOV64(a03, a31); \ + MOV64(a31, a33); \ + MOV64(a33, t); \ + MOV64(t, a10); \ + MOV64(a10, a42); \ + MOV64(a42, a32); \ + MOV64(a32, a40); \ + MOV64(a40, a13); \ + MOV64(a13, a23); \ + MOV64(a23, t); \ + MOV64(t, a14); \ + MOV64(a14, a30); \ + MOV64(a30, a21); \ + MOV64(a21, a41); \ + MOV64(a41, a20); \ + MOV64(a20, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P6_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a02); \ + MOV64(a02, a04); \ + MOV64(a04, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a20); \ + MOV64(a20, a40); \ + MOV64(a40, a30); \ + MOV64(a30, t); \ + MOV64(t, a11); \ + MOV64(a11, a22); \ + MOV64(a22, a44); \ + MOV64(a44, a33); \ + MOV64(a33, t); \ + MOV64(t, a12); \ + MOV64(a12, a24); \ + MOV64(a24, a43); \ + MOV64(a43, a31); \ + MOV64(a31, t); \ + MOV64(t, a13); \ + MOV64(a13, a21); \ + MOV64(a21, a42); \ + MOV64(a42, a34); \ + MOV64(a34, t); \ + MOV64(t, a14); \ + MOV64(a14, a23); \ + MOV64(a23, a41); \ + MOV64(a41, a32); \ + MOV64(a32, t); \ + } while (0) + +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P12_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a04); \ + MOV64(a04, t); \ + MOV64(t, a02); \ + MOV64(a02, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a40); \ + MOV64(a40, t); \ + MOV64(t, a11); \ + MOV64(a11, a44); \ + MOV64(a44, t); \ + MOV64(t, a12); \ + MOV64(a12, a43); \ + MOV64(a43, t); \ + MOV64(t, a13); \ + MOV64(a13, a42); \ + MOV64(a42, t); \ + MOV64(t, a14); \ + MOV64(a14, a41); \ + MOV64(a41, t); \ + MOV64(t, a20); \ + MOV64(a20, a30); \ + MOV64(a30, t); \ + MOV64(t, a21); \ + MOV64(a21, a34); \ + MOV64(a34, t); \ + MOV64(t, a22); \ + MOV64(a22, a33); \ + MOV64(a33, t); \ + MOV64(t, a23); \ + MOV64(a23, a32); \ + MOV64(a32, t); \ + MOV64(t, a24); \ + MOV64(a24, a31); \ + MOV64(a31, t); \ + } while (0) + +#define LPAR ( +#define RPAR ) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + +#define DO(x) x + +#define KECCAK_F_1600 DO(KECCAK_F_1600_) + +#if SPH_KECCAK_UNROLL == 1 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j ++) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + P1_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 2 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 2) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + 
KF_ELT( 1, 2, RC[j + 1]); \ + P2_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 4 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 4) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + P4_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 6 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 6) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + P6_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 8 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 8) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + P8_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 12 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 12) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + KF_ELT( 8, 9, RC[j + 8]); \ + KF_ELT( 9, 10, RC[j + 9]); \ + KF_ELT(10, 11, RC[j + 10]); \ + KF_ELT(11, 12, RC[j + 11]); \ + P12_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 0 + +#define KECCAK_F_1600_ do { \ + KF_ELT( 0, 1, RC[ 0]); \ + KF_ELT( 1, 2, RC[ 1]); \ + KF_ELT( 2, 3, RC[ 2]); \ + KF_ELT( 3, 4, RC[ 3]); \ + KF_ELT( 4, 5, RC[ 4]); \ + KF_ELT( 5, 6, RC[ 5]); \ + KF_ELT( 6, 7, RC[ 6]); \ + KF_ELT( 7, 8, RC[ 7]); \ + KF_ELT( 8, 9, RC[ 8]); \ + KF_ELT( 9, 10, RC[ 9]); \ + KF_ELT(10, 11, RC[10]); \ + KF_ELT(11, 12, RC[11]); \ + KF_ELT(12, 13, RC[12]); \ + KF_ELT(13, 14, RC[13]); \ + KF_ELT(14, 15, RC[14]); \ + KF_ELT(15, 16, RC[15]); \ + KF_ELT(16, 17, RC[16]); \ + KF_ELT(17, 18, RC[17]); \ + KF_ELT(18, 19, RC[18]); \ + KF_ELT(19, 20, RC[19]); \ + KF_ELT(20, 21, RC[20]); \ + KF_ELT(21, 22, RC[21]); \ + KF_ELT(22, 23, RC[22]); \ + KF_ELT(23, 0, RC[23]); \ + } while (0) + +#else + +#error Unimplemented unroll count for Keccak. + +#endif + +static void +keccak_init(sph_keccak_context *kc, unsigned out_size) +{ + int i; + +#if SPH_KECCAK_64 + for (i = 0; i < 25; i ++) + kc->u.wide[i] = 0; + /* + * Initialization for the "lane complement". + */ + kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); +#else + + for (i = 0; i < 50; i ++) + kc->u.narrow[i] = 0; + /* + * Initialization for the "lane complement". + * Note: since we set to all-one full 64-bit words, + * interleaving (if applicable) is a no-op. 
+ */ + kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); +#endif + kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if (len < (lim - ptr)) { + memcpy(buf + ptr, data, len); + kc->ptr = ptr + len; + return; + } + + READ_STATE(kc); + while (len > 0) { + size_t clen; + + clen = (lim - ptr); + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == lim) { + INPUT_BUF(lim); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE(kc); + kc->ptr = ptr; +} + +#if SPH_KECCAK_64 + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ + kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ + kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ + kc->u.wide[12] = ~kc->u.wide[12]; \ + kc->u.wide[17] = ~kc->u.wide[17]; \ + kc->u.wide[20] = ~kc->u.wide[20]; \ + for (j = 0; j < d; j += 8) \ + sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#else + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = (0x100 | (ub & 0xFF)) >> (8 - n); \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ + kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ + kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ + kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \ + kc->u.narrow[16] = ~kc->u.narrow[16]; \ + kc->u.narrow[17] = ~kc->u.narrow[17]; \ + kc->u.narrow[24] = ~kc->u.narrow[24]; \ + kc->u.narrow[25] = ~kc->u.narrow[25]; \ + kc->u.narrow[34] = ~kc->u.narrow[34]; \ + kc->u.narrow[35] = ~kc->u.narrow[35]; \ + kc->u.narrow[40] = ~kc->u.narrow[40]; \ + kc->u.narrow[41] = ~kc->u.narrow[41]; \ + /* un-interleave */ \ + for (j = 0; j < 50; j 
+= 2) \ + UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \ + for (j = 0; j < d; j += 4) \ + sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#endif + +DEFCLOSE(28, 144) +DEFCLOSE(32, 136) +DEFCLOSE(48, 104) +DEFCLOSE(64, 72) + +/* see sph_keccak.h */ +void +sph_keccak224_init(void *cc) +{ + keccak_init(cc, 224); +} + +/* see sph_keccak.h */ +void +sph_keccak224(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 144); +} + +/* see sph_keccak.h */ +void +sph_keccak224_close(void *cc, void *dst) +{ + sph_keccak224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close28(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_init(void *cc) +{ + keccak_init(cc, 256); +} + +/* see sph_keccak.h */ +void +sph_keccak256(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 136); +} + +/* see sph_keccak.h */ +void +sph_keccak256_close(void *cc, void *dst) +{ + sph_keccak256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close32(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_init(void *cc) +{ + keccak_init(cc, 384); +} + +/* see sph_keccak.h */ +void +sph_keccak384(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 104); +} + +/* see sph_keccak.h */ +void +sph_keccak384_close(void *cc, void *dst) +{ + sph_keccak384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close48(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_init(void *cc) +{ + keccak_init(cc, 512); +} + +/* see sph_keccak.h */ +void +sph_keccak512(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 72); +} + +/* see sph_keccak.h */ +void +sph_keccak512_close(void *cc, void *dst) +{ + sph_keccak512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_keccak512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close64(cc, ub, n, dst); +} + + +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_keccak.h b/sha3/sph_keccak.h new file mode 100644 index 00000000..bdafdb88 --- /dev/null +++ b/sha3/sph_keccak.h @@ -0,0 +1,293 @@ +/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Keccak interface. This is the interface for Keccak with the + * recommended parameters for SHA-3, with output lengths 224, 256, + * 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_keccak.h
+ * @author Thomas Pornin
+ */
+
+#ifndef SPH_KECCAK_H__
+#define SPH_KECCAK_H__
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for Keccak-224.
+ */
+#define SPH_SIZE_keccak224 224
+
+/**
+ * Output size (in bits) for Keccak-256.
+ */
+#define SPH_SIZE_keccak256 256
+
+/**
+ * Output size (in bits) for Keccak-384.
+ */
+#define SPH_SIZE_keccak384 384
+
+/**
+ * Output size (in bits) for Keccak-512.
+ */
+#define SPH_SIZE_keccak512 512
+
+/**
+ * This structure is a context for Keccak computations: it contains the
+ * intermediate values and some data from the last entered block. Once a
+ * Keccak computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running Keccak computation
+ * can be cloned by copying the context (e.g. with a simple
+ * memcpy()).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+    unsigned char buf[144];    /* first field, for alignment */
+    size_t ptr, lim;
+    union {
+#if SPH_64
+        sph_u64 wide[25];
+#endif
+        sph_u32 narrow[50];
+    } u;
+#endif
+} sph_keccak_context;
+
+/**
+ * Type for a Keccak-224 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak224_context;
+
+/**
+ * Type for a Keccak-256 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak256_context;
+
+/**
+ * Type for a Keccak-384 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak384_context;
+
+/**
+ * Type for a Keccak-512 context (identical to the common context).
+ */
+typedef sph_keccak_context sph_keccak512_context;
+
+/**
+ * Initialize a Keccak-224 context. This process performs no memory allocation.
+ *
+ * @param cc the Keccak-224 context (pointer to a
+ * sph_keccak224_context)
+ */
+void sph_keccak224_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the Keccak-224 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_keccak224(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-224 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (28 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the Keccak-224 context
+ * @param dst the destination buffer
+ */
+void sph_keccak224_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (28 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the Keccak-224 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_keccak224_addbits_and_close(
+    void *cc, unsigned ub, unsigned n, void *dst);
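+
+/*
+ * Usage sketch (illustrative only; "data" and "data_len" stand for the
+ * caller's input buffer and are not part of this API): a one-shot
+ * Keccak-224 computation. The same init/process/close pattern applies
+ * to the 256-, 384- and 512-bit variants declared below.
+ *
+ *    sph_keccak224_context cc;
+ *    unsigned char digest[28];
+ *
+ *    sph_keccak224_init(&cc);
+ *    sph_keccak224(&cc, data, data_len);
+ *    sph_keccak224_close(&cc, digest);
+ *
+ * sph_keccak224() may be called repeatedly to inject the input in several
+ * chunks; the close call writes the 28-byte digest and reinitializes cc.
+ */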
+
+/**
+ * Initialize a Keccak-256 context. This process performs no memory allocation.
+ *
+ * @param cc the Keccak-256 context (pointer to a
+ * sph_keccak256_context)
+ */
+void sph_keccak256_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the Keccak-256 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_keccak256(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-256 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (32 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the Keccak-256 context
+ * @param dst the destination buffer
+ */
+void sph_keccak256_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (32 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the Keccak-256 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_keccak256_addbits_and_close(
+    void *cc, unsigned ub, unsigned n, void *dst);
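+
+/*
+ * In the addbits_and_close functions, the n extra bits are taken from the
+ * most significant end of ub: appending a single '1' bit is ub = 0x80,
+ * n = 1, and appending the two bits 1,0 (in that order) is ub = 0x80,
+ * n = 2.
+ */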
+
+/**
+ * Initialize a Keccak-384 context. This process performs no memory allocation.
+ *
+ * @param cc the Keccak-384 context (pointer to a
+ * sph_keccak384_context)
+ */
+void sph_keccak384_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the Keccak-384 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_keccak384(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-384 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (48 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the Keccak-384 context
+ * @param dst the destination buffer
+ */
+void sph_keccak384_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (48 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the Keccak-384 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_keccak384_addbits_and_close(
+    void *cc, unsigned ub, unsigned n, void *dst);
+
+/**
+ * Initialize a Keccak-512 context. This process performs no memory allocation.
+ *
+ * @param cc the Keccak-512 context (pointer to a
+ * sph_keccak512_context)
+ */
+void sph_keccak512_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that len is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the Keccak-512 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_keccak512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Keccak-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the Keccak-512 context
+ * @param dst the destination buffer
+ */
+void sph_keccak512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the Keccak-512 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_keccak512_addbits_and_close(
+    void *cc, unsigned ub, unsigned n, void *dst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_luffa.c b/sha3/sph_luffa.c
new file mode 100644
index 00000000..a761bea0
--- /dev/null
+++ b/sha3/sph_luffa.c
@@ -0,0 +1,1426 @@
+/* $Id: luffa.c 219 2010-06-08 17:24:41Z tp $ */
+/*
+ * Luffa implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+
+#include "sph_luffa.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+#if SPH_64_TRUE && !defined SPH_LUFFA_PARALLEL
+#define SPH_LUFFA_PARALLEL 1
+#endif
+
+#ifdef _MSC_VER
+#pragma warning (disable: 4146)
+#endif
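+
+/*
+ * When SPH_LUFFA_PARALLEL is enabled, two of the 32-bit sub-permutations
+ * are evaluated side by side, with their words packed into the low and
+ * high halves of 64-bit values (see SUB_CRUMBW and MIX_WORDW below). The
+ * RCW* tables below hold the matching pairs of 32-bit round constants,
+ * e.g. RCW010[r] = ((sph_u64)RC10[r] << 32) | RC00[r].
+ */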
+
+static const sph_u32 V_INIT[5][8] = {
+    {
+        SPH_C32(0x6d251e69), SPH_C32(0x44b051e0),
+        SPH_C32(0x4eaa6fb4), SPH_C32(0xdbf78465),
+        SPH_C32(0x6e292011), SPH_C32(0x90152df4),
+        SPH_C32(0xee058139), SPH_C32(0xdef610bb)
+    }, {
+        SPH_C32(0xc3b44b95), SPH_C32(0xd9d2f256),
+        SPH_C32(0x70eee9a0), SPH_C32(0xde099fa3),
+        SPH_C32(0x5d9b0557), SPH_C32(0x8fc944b3),
+        SPH_C32(0xcf1ccf0e), SPH_C32(0x746cd581)
+    }, {
+        SPH_C32(0xf7efc89d), SPH_C32(0x5dba5781),
+        SPH_C32(0x04016ce5), SPH_C32(0xad659c05),
+        SPH_C32(0x0306194f), SPH_C32(0x666d1836),
+        SPH_C32(0x24aa230a), SPH_C32(0x8b264ae7)
+    }, {
+        SPH_C32(0x858075d5), SPH_C32(0x36d79cce),
+        SPH_C32(0xe571f7d7), SPH_C32(0x204b1f67),
+        SPH_C32(0x35870c6a), SPH_C32(0x57e9e923),
+        SPH_C32(0x14bcb808), SPH_C32(0x7cde72ce)
+    }, {
+        SPH_C32(0x6c68e9be), SPH_C32(0x5ec41e22),
+        SPH_C32(0xc825b7c7), SPH_C32(0xaffb4363),
+        SPH_C32(0xf5df3999), SPH_C32(0x0fc688f1),
+        SPH_C32(0xb07224cc), SPH_C32(0x03e86cea)
+    }
+};
+
+static const sph_u32 RC00[8] = {
+    SPH_C32(0x303994a6), SPH_C32(0xc0e65299),
+    SPH_C32(0x6cc33a12), SPH_C32(0xdc56983e),
+    SPH_C32(0x1e00108f), SPH_C32(0x7800423d),
+    SPH_C32(0x8f5b7882), SPH_C32(0x96e1db12)
+};
+
+static const sph_u32 RC04[8] = {
+    SPH_C32(0xe0337818), SPH_C32(0x441ba90d),
+    SPH_C32(0x7f34d442), SPH_C32(0x9389217f),
+    SPH_C32(0xe5a8bce6), SPH_C32(0x5274baf4),
+    SPH_C32(0x26889ba7), SPH_C32(0x9a226e9d)
+};
+
+static const sph_u32 RC10[8] = {
+    SPH_C32(0xb6de10ed), SPH_C32(0x70f47aae),
+    SPH_C32(0x0707a3d4), SPH_C32(0x1c1e8f51),
+    SPH_C32(0x707a3d45), SPH_C32(0xaeb28562),
+    SPH_C32(0xbaca1589), SPH_C32(0x40a46f3e)
+};
+
+static const sph_u32 RC14[8] = {
+    SPH_C32(0x01685f3d), SPH_C32(0x05a17cf4),
+    SPH_C32(0xbd09caca), SPH_C32(0xf4272b28),
+    SPH_C32(0x144ae5cc), SPH_C32(0xfaa7ae2b),
+    SPH_C32(0x2e48f1c1), SPH_C32(0xb923c704)
+};
+
+#if SPH_LUFFA_PARALLEL
+
+static const sph_u64 RCW010[8] = {
+    SPH_C64(0xb6de10ed303994a6), SPH_C64(0x70f47aaec0e65299),
+    SPH_C64(0x0707a3d46cc33a12), SPH_C64(0x1c1e8f51dc56983e),
+    SPH_C64(0x707a3d451e00108f), SPH_C64(0xaeb285627800423d),
+    SPH_C64(0xbaca15898f5b7882), SPH_C64(0x40a46f3e96e1db12)
+};
+
+static const sph_u64 RCW014[8] = {
+    SPH_C64(0x01685f3de0337818), SPH_C64(0x05a17cf4441ba90d),
+    SPH_C64(0xbd09caca7f34d442), SPH_C64(0xf4272b289389217f),
+    SPH_C64(0x144ae5cce5a8bce6), SPH_C64(0xfaa7ae2b5274baf4),
+    SPH_C64(0x2e48f1c126889ba7), SPH_C64(0xb923c7049a226e9d)
+};
+
+#endif
+
+static const sph_u32 RC20[8] = {
+    SPH_C32(0xfc20d9d2), SPH_C32(0x34552e25),
+    SPH_C32(0x7ad8818f), SPH_C32(0x8438764a),
+    SPH_C32(0xbb6de032), SPH_C32(0xedb780c8),
+    SPH_C32(0xd9847356), SPH_C32(0xa2c78434)
+};
+
+static const sph_u32 RC24[8] = {
+    SPH_C32(0xe25e72c1), SPH_C32(0xe623bb72),
+    SPH_C32(0x5c58a4a4), SPH_C32(0x1e38e2e7),
+    SPH_C32(0x78e38b9d), SPH_C32(0x27586719),
+    SPH_C32(0x36eda57f), SPH_C32(0x703aace7)
+};
+
+static const sph_u32 RC30[8] = {
+    SPH_C32(0xb213afa5), SPH_C32(0xc84ebe95),
+    SPH_C32(0x4e608a22), SPH_C32(0x56d858fe),
+    SPH_C32(0x343b138f), SPH_C32(0xd0ec4e3d),
+    SPH_C32(0x2ceb4882), SPH_C32(0xb3ad2208)
+};
+
+static const sph_u32 RC34[8] = {
+    SPH_C32(0xe028c9bf), SPH_C32(0x44756f91),
+    SPH_C32(0x7e8fce32), SPH_C32(0x956548be),
+
SPH_C32(0xfe191be2), SPH_C32(0x3cb226e5), + SPH_C32(0x5944a28e), SPH_C32(0xa1c4c355) +}; + +#if SPH_LUFFA_PARALLEL + +static const sph_u64 RCW230[8] = { + SPH_C64(0xb213afa5fc20d9d2), SPH_C64(0xc84ebe9534552e25), + SPH_C64(0x4e608a227ad8818f), SPH_C64(0x56d858fe8438764a), + SPH_C64(0x343b138fbb6de032), SPH_C64(0xd0ec4e3dedb780c8), + SPH_C64(0x2ceb4882d9847356), SPH_C64(0xb3ad2208a2c78434) +}; + + +static const sph_u64 RCW234[8] = { + SPH_C64(0xe028c9bfe25e72c1), SPH_C64(0x44756f91e623bb72), + SPH_C64(0x7e8fce325c58a4a4), SPH_C64(0x956548be1e38e2e7), + SPH_C64(0xfe191be278e38b9d), SPH_C64(0x3cb226e527586719), + SPH_C64(0x5944a28e36eda57f), SPH_C64(0xa1c4c355703aace7) +}; + +#endif + +static const sph_u32 RC40[8] = { + SPH_C32(0xf0d2e9e3), SPH_C32(0xac11d7fa), + SPH_C32(0x1bcb66f2), SPH_C32(0x6f2d9bc9), + SPH_C32(0x78602649), SPH_C32(0x8edae952), + SPH_C32(0x3b6ba548), SPH_C32(0xedae9520) +}; + +static const sph_u32 RC44[8] = { + SPH_C32(0x5090d577), SPH_C32(0x2d1925ab), + SPH_C32(0xb46496ac), SPH_C32(0xd1925ab0), + SPH_C32(0x29131ab6), SPH_C32(0x0fc053c3), + SPH_C32(0x3f014f0c), SPH_C32(0xfc053c31) +}; + +#define DECL_TMP8(w) \ + sph_u32 w ## 0, w ## 1, w ## 2, w ## 3, w ## 4, w ## 5, w ## 6, w ## 7; + +#define M2(d, s) do { \ + sph_u32 tmp = s ## 7; \ + d ## 7 = s ## 6; \ + d ## 6 = s ## 5; \ + d ## 5 = s ## 4; \ + d ## 4 = s ## 3 ^ tmp; \ + d ## 3 = s ## 2 ^ tmp; \ + d ## 2 = s ## 1; \ + d ## 1 = s ## 0 ^ tmp; \ + d ## 0 = tmp; \ + } while (0) + +#define XOR(d, s1, s2) do { \ + d ## 0 = s1 ## 0 ^ s2 ## 0; \ + d ## 1 = s1 ## 1 ^ s2 ## 1; \ + d ## 2 = s1 ## 2 ^ s2 ## 2; \ + d ## 3 = s1 ## 3 ^ s2 ## 3; \ + d ## 4 = s1 ## 4 ^ s2 ## 4; \ + d ## 5 = s1 ## 5 ^ s2 ## 5; \ + d ## 6 = s1 ## 6 ^ s2 ## 6; \ + d ## 7 = s1 ## 7 ^ s2 ## 7; \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define SUB_CRUMB_GEN(a0, a1, a2, a3, width) do { \ + sph_u ## width tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T ## width(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ + (a2) &= (a0); \ + (a0) = SPH_T ## width(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#define SUB_CRUMB(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 32) +#define SUB_CRUMBW(a0, a1, a2, a3) SUB_CRUMB_GEN(a0, a1, a2, a3, 64) + + +#if 0 + +#define ROL32W(x, n) SPH_T64( \ + (((x) << (n)) \ + & ~((SPH_C64(0xFFFFFFFF) >> (32 - (n))) << 32)) \ + | (((x) >> (32 - (n))) \ + & ~((SPH_C64(0xFFFFFFFF) >> (n)) << (n)))) + +#define MIX_WORDW(u, v) do { \ + (v) ^= (u); \ + (u) = ROL32W((u), 2) ^ (v); \ + (v) = ROL32W((v), 14) ^ (u); \ + (u) = ROL32W((u), 10) ^ (v); \ + (v) = ROL32W((v), 1); \ + } while (0) + +#endif + +#define MIX_WORDW(u, v) do { \ + sph_u32 ul, uh, vl, vh; \ + (v) ^= (u); \ + ul = SPH_T32((sph_u32)(u)); \ + uh = SPH_T32((sph_u32)((u) >> 32)); \ + vl = SPH_T32((sph_u32)(v)); \ + vh = SPH_T32((sph_u32)((v) >> 32)); \ + ul = SPH_ROTL32(ul, 2) ^ vl; \ + vl = SPH_ROTL32(vl, 14) ^ ul; \ + ul = SPH_ROTL32(ul, 10) ^ vl; \ + vl = SPH_ROTL32(vl, 1); \ + uh = SPH_ROTL32(uh, 2) ^ vh; \ + vh = SPH_ROTL32(vh, 14) ^ uh; \ + uh = SPH_ROTL32(uh, 10) ^ vh; \ + vh = SPH_ROTL32(vh, 1); \ + (u) = (sph_u64)ul | ((sph_u64)uh << 32); \ + (v) = (sph_u64)vl | ((sph_u64)vh << 32); \ + } while (0) + +#else + +#define SUB_CRUMB(a0, a1, a2, a3) do { \ + sph_u32 tmp; \ + tmp = (a0); \ + (a0) |= (a1); \ + (a2) ^= (a3); \ + (a1) = SPH_T32(~(a1)); \ + (a0) ^= (a3); \ + (a3) &= tmp; \ + (a1) ^= (a3); \ + (a3) ^= (a2); \ 
+ (a2) &= (a0); \ + (a0) = SPH_T32(~(a0)); \ + (a2) ^= (a1); \ + (a1) |= (a3); \ + tmp ^= (a1); \ + (a3) ^= (a2); \ + (a2) &= (a1); \ + (a1) ^= (a0); \ + (a0) = tmp; \ + } while (0) + +#endif + +#define MIX_WORD(u, v) do { \ + (v) ^= (u); \ + (u) = SPH_ROTL32((u), 2) ^ (v); \ + (v) = SPH_ROTL32((v), 14) ^ (u); \ + (u) = SPH_ROTL32((u), 10) ^ (v); \ + (v) = SPH_ROTL32((v), 1); \ + } while (0) + +#define DECL_STATE3 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; + +#define READ_STATE3(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + } while (0) + +#define WRITE_STATE3(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + } while (0) + +#define MI3 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(a, a, V2); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V0, M, V0); \ + M2(M, M); \ + XOR(V1, a, V1); \ + XOR(V1, M, V1); \ + M2(M, M); \ + XOR(V2, a, V2); \ + XOR(V2, M, V2); \ + } while (0) + +#define TWEAK3 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P3 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK3; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); 
\ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#else + +#define P3 do { \ + int r; \ + TWEAK3; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE4 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; + +#define READ_STATE4(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + } while (0) + +#define WRITE_STATE4(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + (state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + 
(state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + } while (0) + +#define MI4 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + M2(b, V0); \ + XOR(b, b, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V0); \ + XOR(V0, b, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + } while (0) + +#define TWEAK4 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P4 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK4; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 << 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); 
\ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + } while (0) + +#else + +#define P4 do { \ + int r; \ + TWEAK4; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + } while (0) + +#endif + +#define DECL_STATE5 \ + sph_u32 V00, V01, V02, V03, V04, V05, V06, V07; \ + sph_u32 V10, V11, V12, V13, V14, V15, V16, V17; \ + sph_u32 V20, V21, V22, V23, V24, V25, V26, V27; \ + sph_u32 V30, V31, V32, V33, V34, V35, V36, V37; \ + sph_u32 V40, V41, V42, V43, V44, V45, V46, V47; + +#define READ_STATE5(state) do { \ + V00 = (state)->V[0][0]; \ + V01 = (state)->V[0][1]; \ + V02 = (state)->V[0][2]; \ + V03 = (state)->V[0][3]; \ + V04 = (state)->V[0][4]; \ + V05 = (state)->V[0][5]; \ + V06 = (state)->V[0][6]; \ + V07 = (state)->V[0][7]; \ + V10 = (state)->V[1][0]; \ + V11 = (state)->V[1][1]; \ + V12 = (state)->V[1][2]; \ + V13 = (state)->V[1][3]; \ + V14 = (state)->V[1][4]; \ + V15 = (state)->V[1][5]; \ + V16 = (state)->V[1][6]; \ + V17 = (state)->V[1][7]; \ + V20 = (state)->V[2][0]; \ + V21 = (state)->V[2][1]; \ + V22 = (state)->V[2][2]; \ + V23 = (state)->V[2][3]; \ + V24 = (state)->V[2][4]; \ + V25 = (state)->V[2][5]; \ + V26 = (state)->V[2][6]; \ + V27 = (state)->V[2][7]; \ + V30 = (state)->V[3][0]; \ + V31 = (state)->V[3][1]; \ + V32 = (state)->V[3][2]; \ + V33 = (state)->V[3][3]; \ + V34 = (state)->V[3][4]; \ + V35 = (state)->V[3][5]; \ + V36 = (state)->V[3][6]; \ + V37 = (state)->V[3][7]; \ + V40 = (state)->V[4][0]; \ + V41 = (state)->V[4][1]; \ + V42 = (state)->V[4][2]; \ + V43 = (state)->V[4][3]; \ + V44 = (state)->V[4][4]; \ + V45 = (state)->V[4][5]; \ + V46 = (state)->V[4][6]; \ + V47 = (state)->V[4][7]; \ + } while (0) + +#define WRITE_STATE5(state) do { \ + (state)->V[0][0] = V00; \ + (state)->V[0][1] = V01; \ + (state)->V[0][2] = V02; \ + (state)->V[0][3] = V03; \ + (state)->V[0][4] = V04; \ + (state)->V[0][5] = V05; \ + (state)->V[0][6] = V06; \ + (state)->V[0][7] = V07; \ + (state)->V[1][0] = V10; \ + (state)->V[1][1] = V11; \ + (state)->V[1][2] = V12; \ + (state)->V[1][3] = V13; \ + (state)->V[1][4] = V14; \ + (state)->V[1][5] = V15; \ + (state)->V[1][6] = V16; \ + (state)->V[1][7] = V17; \ + (state)->V[2][0] = V20; \ + (state)->V[2][1] = V21; \ + (state)->V[2][2] = V22; \ + (state)->V[2][3] = V23; \ + (state)->V[2][4] = V24; \ + 
(state)->V[2][5] = V25; \ + (state)->V[2][6] = V26; \ + (state)->V[2][7] = V27; \ + (state)->V[3][0] = V30; \ + (state)->V[3][1] = V31; \ + (state)->V[3][2] = V32; \ + (state)->V[3][3] = V33; \ + (state)->V[3][4] = V34; \ + (state)->V[3][5] = V35; \ + (state)->V[3][6] = V36; \ + (state)->V[3][7] = V37; \ + (state)->V[4][0] = V40; \ + (state)->V[4][1] = V41; \ + (state)->V[4][2] = V42; \ + (state)->V[4][3] = V43; \ + (state)->V[4][4] = V44; \ + (state)->V[4][5] = V45; \ + (state)->V[4][6] = V46; \ + (state)->V[4][7] = V47; \ + } while (0) + +#define MI5 do { \ + DECL_TMP8(M) \ + DECL_TMP8(a) \ + DECL_TMP8(b) \ + M0 = sph_dec32be_aligned(buf + 0); \ + M1 = sph_dec32be_aligned(buf + 4); \ + M2 = sph_dec32be_aligned(buf + 8); \ + M3 = sph_dec32be_aligned(buf + 12); \ + M4 = sph_dec32be_aligned(buf + 16); \ + M5 = sph_dec32be_aligned(buf + 20); \ + M6 = sph_dec32be_aligned(buf + 24); \ + M7 = sph_dec32be_aligned(buf + 28); \ + XOR(a, V0, V1); \ + XOR(b, V2, V3); \ + XOR(a, a, b); \ + XOR(a, a, V4); \ + M2(a, a); \ + XOR(V0, a, V0); \ + XOR(V1, a, V1); \ + XOR(V2, a, V2); \ + XOR(V3, a, V3); \ + XOR(V4, a, V4); \ + M2(b, V0); \ + XOR(b, b, V1); \ + M2(V1, V1); \ + XOR(V1, V1, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V0); \ + M2(V0, b); \ + XOR(V0, V0, V4); \ + M2(V4, V4); \ + XOR(V4, V4, V3); \ + M2(V3, V3); \ + XOR(V3, V3, V2); \ + M2(V2, V2); \ + XOR(V2, V2, V1); \ + M2(V1, V1); \ + XOR(V1, V1, b); \ + XOR(V0, V0, M); \ + M2(M, M); \ + XOR(V1, V1, M); \ + M2(M, M); \ + XOR(V2, V2, M); \ + M2(M, M); \ + XOR(V3, V3, M); \ + M2(M, M); \ + XOR(V4, V4, M); \ + } while (0) + +#define TWEAK5 do { \ + V14 = SPH_ROTL32(V14, 1); \ + V15 = SPH_ROTL32(V15, 1); \ + V16 = SPH_ROTL32(V16, 1); \ + V17 = SPH_ROTL32(V17, 1); \ + V24 = SPH_ROTL32(V24, 2); \ + V25 = SPH_ROTL32(V25, 2); \ + V26 = SPH_ROTL32(V26, 2); \ + V27 = SPH_ROTL32(V27, 2); \ + V34 = SPH_ROTL32(V34, 3); \ + V35 = SPH_ROTL32(V35, 3); \ + V36 = SPH_ROTL32(V36, 3); \ + V37 = SPH_ROTL32(V37, 3); \ + V44 = SPH_ROTL32(V44, 4); \ + V45 = SPH_ROTL32(V45, 4); \ + V46 = SPH_ROTL32(V46, 4); \ + V47 = SPH_ROTL32(V47, 4); \ + } while (0) + +#if SPH_LUFFA_PARALLEL + +#define P5 do { \ + int r; \ + sph_u64 W0, W1, W2, W3, W4, W5, W6, W7; \ + TWEAK5; \ + W0 = (sph_u64)V00 | ((sph_u64)V10 << 32); \ + W1 = (sph_u64)V01 | ((sph_u64)V11 << 32); \ + W2 = (sph_u64)V02 | ((sph_u64)V12 << 32); \ + W3 = (sph_u64)V03 | ((sph_u64)V13 << 32); \ + W4 = (sph_u64)V04 | ((sph_u64)V14 << 32); \ + W5 = (sph_u64)V05 | ((sph_u64)V15 << 32); \ + W6 = (sph_u64)V06 | ((sph_u64)V16 << 32); \ + W7 = (sph_u64)V07 | ((sph_u64)V17 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW010[r]; \ + W4 ^= RCW014[r]; \ + } \ + V00 = SPH_T32((sph_u32)W0); \ + V10 = SPH_T32((sph_u32)(W0 >> 32)); \ + V01 = SPH_T32((sph_u32)W1); \ + V11 = SPH_T32((sph_u32)(W1 >> 32)); \ + V02 = SPH_T32((sph_u32)W2); \ + V12 = SPH_T32((sph_u32)(W2 >> 32)); \ + V03 = SPH_T32((sph_u32)W3); \ + V13 = SPH_T32((sph_u32)(W3 >> 32)); \ + V04 = SPH_T32((sph_u32)W4); \ + V14 = SPH_T32((sph_u32)(W4 >> 32)); \ + V05 = SPH_T32((sph_u32)W5); \ + V15 = SPH_T32((sph_u32)(W5 >> 32)); \ + V06 = SPH_T32((sph_u32)W6); \ + V16 = SPH_T32((sph_u32)(W6 >> 32)); \ + V07 = SPH_T32((sph_u32)W7); \ + V17 = SPH_T32((sph_u32)(W7 >> 32)); \ + W0 = (sph_u64)V20 | ((sph_u64)V30 << 32); \ + W1 = (sph_u64)V21 | ((sph_u64)V31 
<< 32); \ + W2 = (sph_u64)V22 | ((sph_u64)V32 << 32); \ + W3 = (sph_u64)V23 | ((sph_u64)V33 << 32); \ + W4 = (sph_u64)V24 | ((sph_u64)V34 << 32); \ + W5 = (sph_u64)V25 | ((sph_u64)V35 << 32); \ + W6 = (sph_u64)V26 | ((sph_u64)V36 << 32); \ + W7 = (sph_u64)V27 | ((sph_u64)V37 << 32); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMBW(W0, W1, W2, W3); \ + SUB_CRUMBW(W5, W6, W7, W4); \ + MIX_WORDW(W0, W4); \ + MIX_WORDW(W1, W5); \ + MIX_WORDW(W2, W6); \ + MIX_WORDW(W3, W7); \ + W0 ^= RCW230[r]; \ + W4 ^= RCW234[r]; \ + } \ + V20 = SPH_T32((sph_u32)W0); \ + V30 = SPH_T32((sph_u32)(W0 >> 32)); \ + V21 = SPH_T32((sph_u32)W1); \ + V31 = SPH_T32((sph_u32)(W1 >> 32)); \ + V22 = SPH_T32((sph_u32)W2); \ + V32 = SPH_T32((sph_u32)(W2 >> 32)); \ + V23 = SPH_T32((sph_u32)W3); \ + V33 = SPH_T32((sph_u32)(W3 >> 32)); \ + V24 = SPH_T32((sph_u32)W4); \ + V34 = SPH_T32((sph_u32)(W4 >> 32)); \ + V25 = SPH_T32((sph_u32)W5); \ + V35 = SPH_T32((sph_u32)(W5 >> 32)); \ + V26 = SPH_T32((sph_u32)W6); \ + V36 = SPH_T32((sph_u32)(W6 >> 32)); \ + V27 = SPH_T32((sph_u32)W7); \ + V37 = SPH_T32((sph_u32)(W7 >> 32)); \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#else + +#define P5 do { \ + int r; \ + TWEAK5; \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V00, V01, V02, V03); \ + SUB_CRUMB(V05, V06, V07, V04); \ + MIX_WORD(V00, V04); \ + MIX_WORD(V01, V05); \ + MIX_WORD(V02, V06); \ + MIX_WORD(V03, V07); \ + V00 ^= RC00[r]; \ + V04 ^= RC04[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V10, V11, V12, V13); \ + SUB_CRUMB(V15, V16, V17, V14); \ + MIX_WORD(V10, V14); \ + MIX_WORD(V11, V15); \ + MIX_WORD(V12, V16); \ + MIX_WORD(V13, V17); \ + V10 ^= RC10[r]; \ + V14 ^= RC14[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V20, V21, V22, V23); \ + SUB_CRUMB(V25, V26, V27, V24); \ + MIX_WORD(V20, V24); \ + MIX_WORD(V21, V25); \ + MIX_WORD(V22, V26); \ + MIX_WORD(V23, V27); \ + V20 ^= RC20[r]; \ + V24 ^= RC24[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V30, V31, V32, V33); \ + SUB_CRUMB(V35, V36, V37, V34); \ + MIX_WORD(V30, V34); \ + MIX_WORD(V31, V35); \ + MIX_WORD(V32, V36); \ + MIX_WORD(V33, V37); \ + V30 ^= RC30[r]; \ + V34 ^= RC34[r]; \ + } \ + for (r = 0; r < 8; r ++) { \ + SUB_CRUMB(V40, V41, V42, V43); \ + SUB_CRUMB(V45, V46, V47, V44); \ + MIX_WORD(V40, V44); \ + MIX_WORD(V41, V45); \ + MIX_WORD(V42, V46); \ + MIX_WORD(V43, V47); \ + V40 ^= RC40[r]; \ + V44 ^= RC44[r]; \ + } \ + } while (0) + +#endif + +static void +luffa3(sph_luffa224_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE3(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI3; + P3; + ptr = 0; + } + } + WRITE_STATE3(sc); + sc->ptr = ptr; +} + +static void +luffa3_close(sph_luffa224_context *sc, unsigned ub, unsigned n, + void *dst, unsigned out_size_w32) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE3 + + buf = sc->buf; + ptr = sc->ptr; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; 
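+	/* z = 0x80 >> n sets bit 7-n, so ((ub & -z) | z) keeps the n extra message bits at the top of the byte and appends a single 1 bit right after them; the memset below supplies the trailing zero bits of the usual 10* padding. */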
+ memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE3(sc); + for (i = 0; i < 2; i ++) { + MI3; + P3; + memset(buf, 0, sizeof sc->buf); + } + out = dst; + sph_enc32be(out + 0, V00 ^ V10 ^ V20); + sph_enc32be(out + 4, V01 ^ V11 ^ V21); + sph_enc32be(out + 8, V02 ^ V12 ^ V22); + sph_enc32be(out + 12, V03 ^ V13 ^ V23); + sph_enc32be(out + 16, V04 ^ V14 ^ V24); + sph_enc32be(out + 20, V05 ^ V15 ^ V25); + sph_enc32be(out + 24, V06 ^ V16 ^ V26); + if (out_size_w32 > 7) + sph_enc32be(out + 28, V07 ^ V17 ^ V27); +} + +static void +luffa4(sph_luffa384_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE4(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI4; + P4; + ptr = 0; + } + } + WRITE_STATE4(sc); + sc->ptr = ptr; +} + +static void +luffa4_close(sph_luffa384_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE4 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE4(sc); + for (i = 0; i < 3; i ++) { + MI4; + P4; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ V34); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33); + break; + } + } +} + +static void +luffa5(sph_luffa512_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + if (len < (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE5(sc); + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == sizeof sc->buf) { + MI5; + P5; + ptr = 0; + } + } + WRITE_STATE5(sc); + sc->ptr = ptr; +} + +static void +luffa5_close(sph_luffa512_context *sc, unsigned ub, unsigned n, void *dst) +{ + unsigned char *buf, *out; + size_t ptr; + unsigned z; + int i; + DECL_STATE5 + + buf = sc->buf; + ptr = sc->ptr; + out = dst; + z = 0x80 >> n; + buf[ptr ++] = ((ub & -z) | z) & 0xFF; + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + READ_STATE5(sc); + for (i = 0; i < 3; i ++) { + MI5; + P5; + switch (i) { + case 0: + memset(buf, 0, sizeof sc->buf); + break; + case 1: + sph_enc32be(out + 0, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 4, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 8, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 12, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 16, V04 ^ V14 ^ V24 ^ 
V34 ^ V44); + sph_enc32be(out + 20, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 24, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 28, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + case 2: + sph_enc32be(out + 32, V00 ^ V10 ^ V20 ^ V30 ^ V40); + sph_enc32be(out + 36, V01 ^ V11 ^ V21 ^ V31 ^ V41); + sph_enc32be(out + 40, V02 ^ V12 ^ V22 ^ V32 ^ V42); + sph_enc32be(out + 44, V03 ^ V13 ^ V23 ^ V33 ^ V43); + sph_enc32be(out + 48, V04 ^ V14 ^ V24 ^ V34 ^ V44); + sph_enc32be(out + 52, V05 ^ V15 ^ V25 ^ V35 ^ V45); + sph_enc32be(out + 56, V06 ^ V16 ^ V26 ^ V36 ^ V46); + sph_enc32be(out + 60, V07 ^ V17 ^ V27 ^ V37 ^ V47); + break; + } + } +} + +/* see sph_luffa.h */ +void +sph_luffa224_init(void *cc) +{ + sph_luffa224_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa224(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa224_close(void *cc, void *dst) +{ + sph_luffa224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 7); + sph_luffa224_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa256_init(void *cc) +{ + sph_luffa256_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa256(void *cc, const void *data, size_t len) +{ + luffa3(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa256_close(void *cc, void *dst) +{ + sph_luffa256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa3_close(cc, ub, n, dst, 8); + sph_luffa256_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa384_init(void *cc) +{ + sph_luffa384_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa384(void *cc, const void *data, size_t len) +{ + luffa4(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa384_close(void *cc, void *dst) +{ + sph_luffa384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa4_close(cc, ub, n, dst); + sph_luffa384_init(cc); +} + +/* see sph_luffa.h */ +void +sph_luffa512_init(void *cc) +{ + sph_luffa512_context *sc; + + sc = cc; + memcpy(sc->V, V_INIT, sizeof(sc->V)); + sc->ptr = 0; +} + +/* see sph_luffa.h */ +void +sph_luffa512(void *cc, const void *data, size_t len) +{ + luffa5(cc, data, len); +} + +/* see sph_luffa.h */ +void +sph_luffa512_close(void *cc, void *dst) +{ + sph_luffa512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_luffa.h */ +void +sph_luffa512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + luffa5_close(cc, ub, n, dst); + sph_luffa512_init(cc); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/sph_luffa.h b/sha3/sph_luffa.h new file mode 100644 index 00000000..a32fd7b1 --- /dev/null +++ b/sha3/sph_luffa.h @@ -0,0 +1,296 @@ +/* $Id: sph_luffa.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * Luffa interface. Luffa is a family of functions which differ by + * their output size; this implementation defines Luffa for output + * sizes 224, 256, 384 and 512 bits. 
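+ * Internally the four variants share one design: Luffa-224/256 keep three + * 256-bit state words, Luffa-384 four, and Luffa-512 five, which is + * reflected in the V[3][8], V[4][8] and V[5][8] arrays of the context + * structures declared below.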
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_luffa.h + * @author Thomas Pornin + */ + +#ifndef SPH_LUFFA_H__ +#define SPH_LUFFA_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for Luffa-224. + */ +#define SPH_SIZE_luffa224 224 + +/** + * Output size (in bits) for Luffa-256. + */ +#define SPH_SIZE_luffa256 256 + +/** + * Output size (in bits) for Luffa-384. + */ +#define SPH_SIZE_luffa384 384 + +/** + * Output size (in bits) for Luffa-512. + */ +#define SPH_SIZE_luffa512 512 + +/** + * This structure is a context for Luffa-224 computations: it contains + * the intermediate values and some data from the last entered block. + * Once a Luffa computation has been performed, the context can be + * reused for another computation. + * + * The contents of this structure are private. A running Luffa + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[3][8]; +#endif +} sph_luffa224_context; + +/** + * This structure is a context for Luffa-256 computations. It is + * identical to sph_luffa224_context. + */ +typedef sph_luffa224_context sph_luffa256_context; + +/** + * This structure is a context for Luffa-384 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[4][8]; +#endif +} sph_luffa384_context; + +/** + * This structure is a context for Luffa-512 computations. + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[32]; /* first field, for alignment */ + size_t ptr; + sph_u32 V[5][8]; +#endif +} sph_luffa512_context; + +/** + * Initialize a Luffa-224 context. This process performs no memory allocation. + * + * @param cc the Luffa-224 context (pointer to a + * sph_luffa224_context) + */ +void sph_luffa224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing).
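+ * + * For illustration, the declarations in this header compose as follows + * (a minimal sketch; input may be split across calls in any way): + * + *   sph_luffa224_context cc; + *   unsigned char digest[28]; + *   sph_luffa224_init(&cc); + *   sph_luffa224(&cc, "ab", 2); + *   sph_luffa224(&cc, "c", 1); + *   sph_luffa224_close(&cc, digest);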
+ * + * @param cc the Luffa-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-224 context + * @param dst the destination buffer + */ +void sph_luffa224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-256 context. This process performs no memory allocation. + * + * @param cc the Luffa-256 context (pointer to a + * sph_luffa256_context) + */ +void sph_luffa256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-256 context + * @param dst the destination buffer + */ +void sph_luffa256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-384 context. This process performs no memory allocation. + * + * @param cc the Luffa-384 context (pointer to a + * sph_luffa384_context) + */ +void sph_luffa384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized.
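+ * (As luffa4_close earlier in this patch shows, the 48-byte result is + * produced in two steps: 32 bytes are extracted after the first + * blank-block round and the remaining 16 after the second.)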
+ * + * @param cc the Luffa-384 context + * @param dst the destination buffer + */ +void sph_luffa384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Luffa-512 context. This process performs no memory allocation. + * + * @param cc the Luffa-512 context (pointer to a + * sph_luffa512_context) + */ +void sph_luffa512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Luffa-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_luffa512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Luffa-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Luffa-512 context + * @param dst the destination buffer + */ +void sph_luffa512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Luffa-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_luffa512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_shavite.c b/sha3/sph_shavite.c new file mode 100644 index 00000000..85074f33 --- /dev/null +++ b/sha3/sph_shavite.c @@ -0,0 +1,1764 @@ +/* $Id: shavite.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHAvite-3 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_shavite.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHAVITE +#define SPH_SMALL_FOOTPRINT_SHAVITE 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#define C32 SPH_C32 + +/* + * As of round 2 of the SHA-3 competition, the published reference + * implementation and test vectors are wrong, because they use + * big-endian AES tables while the internal decoding uses little-endian. + * The code below follows the specification. To turn it into code + * which follows the reference implementation (the one called "BugFix" + * on the SHAvite-3 web site, published on Nov 23rd, 2009), comment out + * the code below (from the '#define AES_BIG_ENDIAN...' to the definition + * of the AES_ROUND_NOKEY macro) and replace it with the version which + * is commented out afterwards. + */ + +#define AES_BIG_ENDIAN 0 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0x6774F31C), C32(0x990AE210), C32(0xC87D4274), C32(0xC9546371), + C32(0x62B2AEA8), C32(0x4B5801D8), C32(0x1B702860), C32(0x842F3017) +}; + +static const sph_u32 IV256[] = { + C32(0x49BB3E47), C32(0x2674860D), C32(0xA8B392AC), C32(0x021AC4E6), + C32(0x409283CF), C32(0x620E5D86), C32(0x6D929DCB), C32(0x96CC2A8B) +}; + +static const sph_u32 IV384[] = { + C32(0x83DF1545), C32(0xF9AAEC13), C32(0xF4803CB0), C32(0x11FE1F47), + C32(0xDA6CD269), C32(0x4F53FCD7), C32(0x950529A2), C32(0x97908147), + C32(0xB0A4D7AF), C32(0x2B9132BF), C32(0x226E607D), C32(0x3C0F8D7C), + C32(0x487B3F0F), C32(0x04363E22), C32(0x0155C99C), C32(0xEC2E20D3) +}; + +static const sph_u32 IV512[] = { + C32(0x72FCCDD8), C32(0x79CA4727), C32(0x128A077B), C32(0x40D55AEC), + C32(0xD1901A06), C32(0x430AE307), C32(0xB29F5CD1), C32(0xDF07FBFC), + C32(0x8E45D73D), C32(0x681AB538), C32(0xBDE86578), C32(0xDD577E47), + C32(0xE275EADE), C32(0x502D9FCD), C32(0xB9357178), C32(0x022A4B9A) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_LE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + +/* + * This is the code needed to match the "reference implementation" as + * published on Nov 23rd, 2009, instead of the published specification.
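+ * Note that the two variants differ in exactly three coupled places: the + * AES_BIG_ENDIAN value seen by aes_helper.c, the four IV arrays, and + * whether AES_ROUND_NOKEY wraps AES_ROUND_NOKEY_LE or AES_ROUND_NOKEY_BE; + * all three must be switched together, as in the block below.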
+ * + +#define AES_BIG_ENDIAN 1 +#include "aes_helper.c" + +static const sph_u32 IV224[] = { + C32(0xC4C67795), C32(0xC0B1817F), C32(0xEAD88924), C32(0x1ABB1BB0), + C32(0xE0C29152), C32(0xBDE046BA), C32(0xAEEECF99), C32(0x58D509D8) +}; + +static const sph_u32 IV256[] = { + C32(0x3EECF551), C32(0xBF10819B), C32(0xE6DC8559), C32(0xF3E23FD5), + C32(0x431AEC73), C32(0x79E3F731), C32(0x98325F05), C32(0xA92A31F1) +}; + +static const sph_u32 IV384[] = { + C32(0x71F48510), C32(0xA903A8AC), C32(0xFE3216DD), C32(0x0B2D2AD4), + C32(0x6672900A), C32(0x41032819), C32(0x15A7D780), C32(0xB3CAB8D9), + C32(0x34EF4711), C32(0xDE019FE8), C32(0x4D674DC4), C32(0xE056D96B), + C32(0xA35C016B), C32(0xDD903BA7), C32(0x8C1B09B4), C32(0x2C3E9F25) +}; + +static const sph_u32 IV512[] = { + C32(0xD5652B63), C32(0x25F1E6EA), C32(0xB18F48FA), C32(0xA1EE3A47), + C32(0xC8B67B07), C32(0xBDCE48D3), C32(0xE3937B78), C32(0x05DB5186), + C32(0x613BE326), C32(0xA11FA303), C32(0x90C833D4), C32(0x79CEE316), + C32(0x1E1AF00F), C32(0x2829B165), C32(0x23B25F80), C32(0x21E11499) +}; + +#define AES_ROUND_NOKEY(x0, x1, x2, x3) do { \ + sph_u32 t0 = (x0); \ + sph_u32 t1 = (x1); \ + sph_u32 t2 = (x2); \ + sph_u32 t3 = (x3); \ + AES_ROUND_NOKEY_BE(t0, t1, t2, t3, x0, x1, x2, x3); \ + } while (0) + + */ + +#define KEY_EXPAND_ELT(k0, k1, k2, k3) do { \ + sph_u32 kt; \ + AES_ROUND_NOKEY(k1, k2, k3, k0); \ + kt = (k0); \ + (k0) = (k1); \ + (k1) = (k2); \ + (k2) = (k3); \ + (k3) = kt; \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 rk[144]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 64); +#else + for (u = 0; u < 16; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 16; + for (r = 0; r < 4; r ++) { + for (s = 0; s < 2; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 16) { + rk[ 16] ^= sc->count0; + rk[ 17] ^= SPH_T32(~sc->count1); + } else if (u == 56) { + rk[ 57] ^= sc->count1; + rk[ 58] ^= SPH_T32(~sc->count0); + } + u += 4; + + x0 = rk[u - 15]; + x1 = rk[u - 14]; + x2 = rk[u - 13]; + x3 = rk[u - 16]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 84) { + rk[ 86] ^= sc->count1; + rk[ 87] ^= SPH_T32(~sc->count0); + } else if (u == 124) { + rk[124] ^= sc->count0; + rk[127] ^= SPH_T32(~sc->count1); + } + u += 4; + } + for (s = 0; s < 4; s ++) { + rk[u + 0] = rk[u - 16] ^ rk[u - 3]; + rk[u + 1] = rk[u - 15] ^ rk[u - 2]; + rk[u + 2] = rk[u - 14] ^ rk[u - 1]; + rk[u + 3] = rk[u - 13] ^ rk[u - 0]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + u = 0; + for (r = 0; r < 6; r ++) { + sph_u32 x0, x1, x2, x3; + + x0 = p4 ^ rk[u ++]; + x1 = p5 ^ rk[u ++]; + x2 = p6 ^ 
rk[u ++]; + x3 = p7 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + x0 = p0 ^ rk[u ++]; + x1 = p1 ^ rk[u ++]; + x2 = p2 ^ rk[u ++]; + x3 = p3 ^ rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + x0 ^= rk[u ++]; + x1 ^= rk[u ++]; + x2 ^= rk[u ++]; + x3 ^= rk[u ++]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c256(sph_shavite_small_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 x0, x1, x2, x3; + sph_u32 rk0, rk1, rk2, rk3, rk4, rk5, rk6, rk7; + sph_u32 rk8, rk9, rkA, rkB, rkC, rkD, rkE, rkF; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + /* round 0 */ + rk0 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk0; + rk1 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk1; + rk2 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk2; + rk3 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk4; + rk5 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk5; + rk6 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk6; + rk7 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk8; + rk9 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk9; + rkA = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rkA; + rkB = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 1 */ + rkC = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 = p0 ^ rkC; + rkD = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 = p1 ^ rkD; + rkE = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 = p2 ^ rkE; + rkF = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC ^ sc->count0; + rk1 ^= rkD ^ SPH_T32(~sc->count1); + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + 
rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 4 */ + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 = p4 ^ rk0; + x1 = p5 ^ rk1; + x2 = p6 ^ rk2; + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5 ^ sc->count1; + rkA ^= rk6 ^ SPH_T32(~sc->count0); + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 5 */ + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 = p0 ^ rkC; + x1 = p1 ^ rkD; + x2 = p2 ^ rkE; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 6 */ + rk8 ^= rk5; + x0 = p4 ^ rk8; + rk9 ^= rk6; + x1 = p5 ^ rk9; + rkA ^= rk7; + x2 = p6 ^ rkA; + rkB ^= rk8; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 7 */ + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2 ^ sc->count1; + rk7 ^= rk3 ^ SPH_T32(~sc->count0); + x0 = p0 ^ rk4; + x1 = p1 ^ rk5; + x2 = p2 ^ rk6; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 ^= rk8; + x1 ^= rk9; + x2 ^= rkA; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB; + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 8 */ + rk0 ^= rkD; + x0 = p4 ^ rk0; + rk1 ^= rkE; + x1 = p5 ^ rk1; + rk2 ^= rkF; + x2 = p6 ^ rk2; + rk3 ^= rk0; + x3 = p7 ^ rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk4 ^= rk1; + x0 ^= rk4; + rk5 ^= rk2; + x1 ^= rk5; + rk6 ^= rk3; + x2 ^= rk6; + rk7 ^= rk4; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= 
rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 9 */ + rkC ^= rk9; + x0 = p0 ^ rkC; + rkD ^= rkA; + x1 = p1 ^ rkD; + rkE ^= rkB; + x2 = p2 ^ rkE; + rkF ^= rkC; + x3 = p3 ^ rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0, rk1, rk2, rk3); + rk0 ^= rkC; + rk1 ^= rkD; + rk2 ^= rkE; + rk3 ^= rkF; + x0 ^= rk0; + x1 ^= rk1; + x2 ^= rk2; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk4, rk5, rk6, rk7); + rk4 ^= rk0; + rk5 ^= rk1; + rk6 ^= rk2; + rk7 ^= rk3; + x0 ^= rk4; + x1 ^= rk5; + x2 ^= rk6; + x3 ^= rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 10 */ + KEY_EXPAND_ELT(rk8, rk9, rkA, rkB); + rk8 ^= rk4; + rk9 ^= rk5; + rkA ^= rk6; + rkB ^= rk7; + x0 = p4 ^ rk8; + x1 = p5 ^ rk9; + x2 = p6 ^ rkA; + x3 = p7 ^ rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rkC, rkD, rkE, rkF); + rkC ^= rk8 ^ sc->count0; + rkD ^= rk9; + rkE ^= rkA; + rkF ^= rkB ^ SPH_T32(~sc->count1); + x0 ^= rkC; + x1 ^= rkD; + x2 ^= rkE; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0 ^= rkD; + x0 ^= rk0; + rk1 ^= rkE; + x1 ^= rk1; + rk2 ^= rkF; + x2 ^= rk2; + rk3 ^= rk0; + x3 ^= rk3; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 11 */ + rk4 ^= rk1; + x0 = p0 ^ rk4; + rk5 ^= rk2; + x1 = p1 ^ rk5; + rk6 ^= rk3; + x2 = p2 ^ rk6; + rk7 ^= rk4; + x3 = p3 ^ rk7; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk8 ^= rk5; + x0 ^= rk8; + rk9 ^= rk6; + x1 ^= rk9; + rkA ^= rk7; + x2 ^= rkA; + rkB ^= rk8; + x3 ^= rkB; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rkC ^= rk9; + x0 ^= rkC; + rkD ^= rkA; + x1 ^= rkD; + rkE ^= rkB; + x2 ^= rkE; + rkF ^= rkC; + x3 ^= rkF; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; +} + +#endif + +#if SPH_SMALL_FOOTPRINT_SHAVITE + +/* + * This function assumes that "msg" is aligned for 32-bit access. 
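+ * In this file that assumption is met by always passing the context's + * internal buffer (sc->buf), which follows the same "first field, for + * alignment" layout convention as the Luffa contexts in this patch.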
+ */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 rk[448]; + size_t u; + int r, s; + +#if SPH_LITTLE_ENDIAN + memcpy(rk, msg, 128); +#else + for (u = 0; u < 32; u += 4) { + rk[u + 0] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 0); + rk[u + 1] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 4); + rk[u + 2] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 8); + rk[u + 3] = sph_dec32le_aligned( + (const unsigned char *)msg + (u << 2) + 12); + } +#endif + u = 32; + for (;;) { + for (s = 0; s < 4; s ++) { + sph_u32 x0, x1, x2, x3; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 32) { + rk[ 32] ^= sc->count0; + rk[ 33] ^= sc->count1; + rk[ 34] ^= sc->count2; + rk[ 35] ^= SPH_T32(~sc->count3); + } else if (u == 440) { + rk[440] ^= sc->count1; + rk[441] ^= sc->count0; + rk[442] ^= sc->count3; + rk[443] ^= SPH_T32(~sc->count2); + } + u += 4; + + x0 = rk[u - 31]; + x1 = rk[u - 30]; + x2 = rk[u - 29]; + x3 = rk[u - 32]; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk[u + 0] = x0 ^ rk[u - 4]; + rk[u + 1] = x1 ^ rk[u - 3]; + rk[u + 2] = x2 ^ rk[u - 2]; + rk[u + 3] = x3 ^ rk[u - 1]; + if (u == 164) { + rk[164] ^= sc->count3; + rk[165] ^= sc->count2; + rk[166] ^= sc->count1; + rk[167] ^= SPH_T32(~sc->count0); + } else if (u == 316) { + rk[316] ^= sc->count2; + rk[317] ^= sc->count3; + rk[318] ^= sc->count0; + rk[319] ^= SPH_T32(~sc->count1); + } + u += 4; + } + if (u == 448) + break; + for (s = 0; s < 8; s ++) { + rk[u + 0] = rk[u - 32] ^ rk[u - 7]; + rk[u + 1] = rk[u - 31] ^ rk[u - 6]; + rk[u + 2] = rk[u - 30] ^ rk[u - 5]; + rk[u + 3] = rk[u - 29] ^ rk[u - 4]; + u += 4; + } + } + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + u = 0; + for (r = 0; r < 14; r ++) { +#define C512_ELT(l0, l1, l2, l3, r0, r1, r2, r3) do { \ + sph_u32 x0, x1, x2, x3; \ + x0 = r0 ^ rk[u ++]; \ + x1 = r1 ^ rk[u ++]; \ + x2 = r2 ^ rk[u ++]; \ + x3 = r3 ^ rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + x0 ^= rk[u ++]; \ + x1 ^= rk[u ++]; \ + x2 ^= rk[u ++]; \ + x3 ^= rk[u ++]; \ + AES_ROUND_NOKEY(x0, x1, x2, x3); \ + l0 ^= x0; \ + l1 ^= x1; \ + l2 ^= x2; \ + l3 ^= x3; \ + } while (0) + +#define WROT(a, b, c, d) do { \ + sph_u32 t = d; \ + d = c; \ + c = b; \ + b = a; \ + a = t; \ + } while (0) + + C512_ELT(p0, p1, p2, p3, p4, p5, p6, p7); + C512_ELT(p8, p9, pA, pB, pC, pD, pE, pF); + + WROT(p0, p4, p8, pC); + WROT(p1, p5, p9, pD); + WROT(p2, p6, pA, pE); + WROT(p3, p7, pB, pF); + +#undef C512_ELT +#undef WROT + } + sc->h[0x0] ^= p0; + sc->h[0x1] ^= p1; + sc->h[0x2] ^= p2; + sc->h[0x3] ^= p3; + sc->h[0x4] ^= p4; + sc->h[0x5] ^= p5; + sc->h[0x6] ^= p6; + sc->h[0x7] ^= p7; + sc->h[0x8] ^= p8; + sc->h[0x9] ^= p9; + sc->h[0xA] ^= pA; + sc->h[0xB] ^= pB; + sc->h[0xC] ^= pC; + sc->h[0xD] ^= pD; + 
sc->h[0xE] ^= pE; + sc->h[0xF] ^= pF; +} + +#else + +/* + * This function assumes that "msg" is aligned for 32-bit access. + */ +static void +c512(sph_shavite_big_context *sc, const void *msg) +{ + sph_u32 p0, p1, p2, p3, p4, p5, p6, p7; + sph_u32 p8, p9, pA, pB, pC, pD, pE, pF; + sph_u32 x0, x1, x2, x3; + sph_u32 rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + sph_u32 rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + sph_u32 rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + sph_u32 rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + int r; + + p0 = sc->h[0x0]; + p1 = sc->h[0x1]; + p2 = sc->h[0x2]; + p3 = sc->h[0x3]; + p4 = sc->h[0x4]; + p5 = sc->h[0x5]; + p6 = sc->h[0x6]; + p7 = sc->h[0x7]; + p8 = sc->h[0x8]; + p9 = sc->h[0x9]; + pA = sc->h[0xA]; + pB = sc->h[0xB]; + pC = sc->h[0xC]; + pD = sc->h[0xD]; + pE = sc->h[0xE]; + pF = sc->h[0xF]; + /* round 0 */ + rk00 = sph_dec32le_aligned((const unsigned char *)msg + 0); + x0 = p4 ^ rk00; + rk01 = sph_dec32le_aligned((const unsigned char *)msg + 4); + x1 = p5 ^ rk01; + rk02 = sph_dec32le_aligned((const unsigned char *)msg + 8); + x2 = p6 ^ rk02; + rk03 = sph_dec32le_aligned((const unsigned char *)msg + 12); + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 = sph_dec32le_aligned((const unsigned char *)msg + 16); + x0 ^= rk04; + rk05 = sph_dec32le_aligned((const unsigned char *)msg + 20); + x1 ^= rk05; + rk06 = sph_dec32le_aligned((const unsigned char *)msg + 24); + x2 ^= rk06; + rk07 = sph_dec32le_aligned((const unsigned char *)msg + 28); + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 = sph_dec32le_aligned((const unsigned char *)msg + 32); + x0 ^= rk08; + rk09 = sph_dec32le_aligned((const unsigned char *)msg + 36); + x1 ^= rk09; + rk0A = sph_dec32le_aligned((const unsigned char *)msg + 40); + x2 ^= rk0A; + rk0B = sph_dec32le_aligned((const unsigned char *)msg + 44); + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C = sph_dec32le_aligned((const unsigned char *)msg + 48); + x0 ^= rk0C; + rk0D = sph_dec32le_aligned((const unsigned char *)msg + 52); + x1 ^= rk0D; + rk0E = sph_dec32le_aligned((const unsigned char *)msg + 56); + x2 ^= rk0E; + rk0F = sph_dec32le_aligned((const unsigned char *)msg + 60); + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 = sph_dec32le_aligned((const unsigned char *)msg + 64); + x0 = pC ^ rk10; + rk11 = sph_dec32le_aligned((const unsigned char *)msg + 68); + x1 = pD ^ rk11; + rk12 = sph_dec32le_aligned((const unsigned char *)msg + 72); + x2 = pE ^ rk12; + rk13 = sph_dec32le_aligned((const unsigned char *)msg + 76); + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 = sph_dec32le_aligned((const unsigned char *)msg + 80); + x0 ^= rk14; + rk15 = sph_dec32le_aligned((const unsigned char *)msg + 84); + x1 ^= rk15; + rk16 = sph_dec32le_aligned((const unsigned char *)msg + 88); + x2 ^= rk16; + rk17 = sph_dec32le_aligned((const unsigned char *)msg + 92); + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 = sph_dec32le_aligned((const unsigned char *)msg + 96); + x0 ^= rk18; + rk19 = sph_dec32le_aligned((const unsigned char *)msg + 100); + x1 ^= rk19; + rk1A = sph_dec32le_aligned((const unsigned char *)msg + 104); + x2 ^= rk1A; + rk1B = sph_dec32le_aligned((const unsigned char *)msg + 108); + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C = sph_dec32le_aligned((const unsigned char *)msg + 112); + x0 ^= rk1C; + rk1D = sph_dec32le_aligned((const unsigned char *)msg + 116); + x1 ^= rk1D; + rk1E = sph_dec32le_aligned((const unsigned 
char *)msg + 120); + x2 ^= rk1E; + rk1F = sph_dec32le_aligned((const unsigned char *)msg + 124); + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + for (r = 0; r < 3; r ++) { + /* round 1, 5, 9 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + if (r == 0) { + rk00 ^= sc->count0; + rk01 ^= sc->count1; + rk02 ^= sc->count2; + rk03 ^= SPH_T32(~sc->count3); + } + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + if (r == 1) { + rk04 ^= sc->count3; + rk05 ^= sc->count2; + rk06 ^= sc->count1; + rk07 ^= SPH_T32(~sc->count0); + } + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + if (r == 2) { + rk1C ^= sc->count2; + rk1D ^= sc->count3; + rk1E ^= sc->count0; + rk1F ^= SPH_T32(~sc->count1); + } + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + /* round 2, 6, 10 */ + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + 
rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + } + /* round 13 */ + KEY_EXPAND_ELT(rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk04, rk05, rk06, rk07); + rk04 ^= rk00; + 
rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk18, rk19, rk1A, rk1B); + rk18 ^= rk14 ^ sc->count1; + rk19 ^= rk15 ^ sc->count0; + rk1A ^= rk16 ^ sc->count3; + rk1B ^= rk17 ^ SPH_T32(~sc->count2); + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(x0, x1, x2, x3); + KEY_EXPAND_ELT(rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + sc->h[0x0] ^= p8; + sc->h[0x1] ^= p9; + sc->h[0x2] ^= pA; + sc->h[0x3] ^= pB; + sc->h[0x4] ^= pC; + sc->h[0x5] ^= pD; + sc->h[0x6] ^= pE; + sc->h[0x7] ^= pF; + sc->h[0x8] ^= p0; + sc->h[0x9] ^= p1; + sc->h[0xA] ^= p2; + sc->h[0xB] ^= p3; + sc->h[0xC] ^= p4; + sc->h[0xD] ^= p5; + sc->h[0xE] ^= p6; + sc->h[0xF] ^= p7; +} + +#endif + +static void +shavite_small_init(sph_shavite_small_context *sc, const sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; +} + +static void +shavite_small_core(sph_shavite_small_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 512)) == 0) + sc->count1 = SPH_T32(sc->count1 + 1); + c256(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_small_close(sph_shavite_small_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 53); + sc->count0 = sc->count1 = 0; + } else if (ptr < 54) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 54 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 64 - ptr); + c256(sc, buf); + memset(buf, 0, 54); + sc->count0 = sc->count1 = 0; + } + sph_enc32le(buf + 54, count0); + sph_enc32le(buf + 58, count1); + buf[62] = out_size_w32 << 5; + buf[63] = out_size_w32 >> 3; + c256(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +static void +shavite_big_init(sph_shavite_big_context *sc, const 
sph_u32 *iv) +{ + memcpy(sc->h, iv, sizeof sc->h); + sc->ptr = 0; + sc->count0 = 0; + sc->count1 = 0; + sc->count2 = 0; + sc->count3 = 0; +} + +static void +shavite_big_core(sph_shavite_big_context *sc, const void *data, size_t len) +{ + unsigned char *buf; + size_t ptr; + + buf = sc->buf; + ptr = sc->ptr; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + data = (const unsigned char *)data + clen; + ptr += clen; + len -= clen; + if (ptr == sizeof sc->buf) { + if ((sc->count0 = SPH_T32(sc->count0 + 1024)) == 0) { + sc->count1 = SPH_T32(sc->count1 + 1); + if (sc->count1 == 0) { + sc->count2 = SPH_T32(sc->count2 + 1); + if (sc->count2 == 0) { + sc->count3 = SPH_T32( + sc->count3 + 1); + } + } + } + c512(sc, buf); + ptr = 0; + } + } + sc->ptr = ptr; +} + +static void +shavite_big_close(sph_shavite_big_context *sc, + unsigned ub, unsigned n, void *dst, size_t out_size_w32) +{ + unsigned char *buf; + size_t ptr, u; + unsigned z; + sph_u32 count0, count1, count2, count3; + + buf = sc->buf; + ptr = sc->ptr; + count0 = (sc->count0 += (ptr << 3) + n); + count1 = sc->count1; + count2 = sc->count2; + count3 = sc->count3; + z = 0x80 >> n; + z = ((ub & -z) | z) & 0xFF; + if (ptr == 0 && n == 0) { + buf[0] = 0x80; + memset(buf + 1, 0, 109); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } else if (ptr < 110) { + buf[ptr ++] = z; + memset(buf + ptr, 0, 110 - ptr); + } else { + buf[ptr ++] = z; + memset(buf + ptr, 0, 128 - ptr); + c512(sc, buf); + memset(buf, 0, 110); + sc->count0 = sc->count1 = sc->count2 = sc->count3 = 0; + } + sph_enc32le(buf + 110, count0); + sph_enc32le(buf + 114, count1); + sph_enc32le(buf + 118, count2); + sph_enc32le(buf + 122, count3); + buf[126] = out_size_w32 << 5; + buf[127] = out_size_w32 >> 3; + c512(sc, buf); + for (u = 0; u < out_size_w32; u ++) + sph_enc32le((unsigned char *)dst + (u << 2), sc->h[u]); +} + +/* see sph_shavite.h */ +void +sph_shavite224_init(void *cc) +{ + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite224(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite224_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 7); + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 7); + shavite_small_init(cc, IV224); +} + +/* see sph_shavite.h */ +void +sph_shavite256_init(void *cc) +{ + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256(void *cc, const void *data, size_t len) +{ + shavite_small_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite256_close(void *cc, void *dst) +{ + shavite_small_close(cc, 0, 0, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_small_close(cc, ub, n, dst, 8); + shavite_small_init(cc, IV256); +} + +/* see sph_shavite.h */ +void +sph_shavite384_init(void *cc) +{ + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite384(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite384_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void 
+sph_shavite384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 12); + shavite_big_init(cc, IV384); +} + +/* see sph_shavite.h */ +void +sph_shavite512_init(void *cc) +{ + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512(void *cc, const void *data, size_t len) +{ + shavite_big_core(cc, data, len); +} + +/* see sph_shavite.h */ +void +sph_shavite512_close(void *cc, void *dst) +{ + shavite_big_close(cc, 0, 0, dst, 16); + shavite_big_init(cc, IV512); +} + +/* see sph_shavite.h */ +void +sph_shavite512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + shavite_big_close(cc, ub, n, dst, 16); + shavite_big_init(cc, IV512); +} + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/sph_shavite.h b/sha3/sph_shavite.h new file mode 100644 index 00000000..0957e42a --- /dev/null +++ b/sha3/sph_shavite.h @@ -0,0 +1,314 @@ +/* $Id: sph_shavite.h 208 2010-06-02 20:33:00Z tp $ */ +/** + * SHAvite-3 interface. This code implements SHAvite-3 with the + * recommended parameters for SHA-3, with outputs of 224, 256, 384 and + * 512 bits. In the following, we call the function "SHAvite" (without + * the "-3" suffix), thus "SHAvite-224" is "SHAvite-3 with a 224-bit + * output". + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_shavite.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHAVITE_H__ +#define SPH_SHAVITE_H__ + +#include <stddef.h> +#include "sph_types.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +/** + * Output size (in bits) for SHAvite-224. + */ +#define SPH_SIZE_shavite224 224 + +/** + * Output size (in bits) for SHAvite-256. + */ +#define SPH_SIZE_shavite256 256 + +/** + * Output size (in bits) for SHAvite-384. + */ +#define SPH_SIZE_shavite384 384 + +/** + * Output size (in bits) for SHAvite-512. + */ +#define SPH_SIZE_shavite512 512 + +/** + * This structure is a context for SHAvite-224 and SHAvite-256 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g.
with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[8]; + sph_u32 count0, count1; +#endif +} sph_shavite_small_context; + +/** + * This structure is a context for SHAvite-224 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite224_context; + +/** + * This structure is a context for SHAvite-256 computations. It is + * identical to the common sph_shavite_small_context. + */ +typedef sph_shavite_small_context sph_shavite256_context; + +/** + * This structure is a context for SHAvite-384 and SHAvite-512 computations: + * it contains the intermediate values and some data from the last + * entered block. Once a SHAvite computation has been performed, the + * context can be reused for another computation. + * + * The contents of this structure are private. A running SHAvite + * computation can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[128]; /* first field, for alignment */ + size_t ptr; + sph_u32 h[16]; + sph_u32 count0, count1, count2, count3; +#endif +} sph_shavite_big_context; + +/** + * This structure is a context for SHAvite-384 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite384_context; + +/** + * This structure is a context for SHAvite-512 computations. It is + * identical to the common sph_shavite_big_context. + */ +typedef sph_shavite_big_context sph_shavite512_context; + +/** + * Initialize a SHAvite-224 context. This process performs no memory allocation. + * + * @param cc the SHAvite-224 context (pointer to a + * sph_shavite224_context) + */ +void sph_shavite224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-224 context + * @param dst the destination buffer + */ +void sph_shavite224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-256 context. This process performs no memory allocation. + * + * @param cc the SHAvite-256 context (pointer to a + * sph_shavite256_context) + */ +void sph_shavite256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing).
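+ * + * A minimal usage sketch with the functions declared in this header (data, len and out256 are hypothetical caller-supplied variables; out256 must be able to hold the 32-byte digest): + * + * sph_shavite256_context cc; + * + * sph_shavite256_init(&cc); + * sph_shavite256(&cc, data, len); + * sph_shavite256_close(&cc, out256);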
+ * + * @param cc the SHAvite-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-256 context + * @param dst the destination buffer + */ +void sph_shavite256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-384 context. This process performs no memory allocation. + * + * @param cc the SHAvite-384 context (pointer to a + * sph_shavite384_context) + */ +void sph_shavite384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the SHAvite-384 context + * @param dst the destination buffer + */ +void sph_shavite384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a SHAvite-512 context. This process performs no memory allocation. + * + * @param cc the SHAvite-512 context (pointer to a + * sph_shavite512_context) + */ +void sph_shavite512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the SHAvite-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_shavite512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SHAvite-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized.
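+ * + * A short sketch, assuming cc is a sph_shavite512_context that has already absorbed the input (dst512 is a hypothetical output array): + * + * unsigned char dst512[64]; + * + * sph_shavite512_close(&cc, dst512);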
+ * + * @param cc the SHAvite-512 context + * @param dst the destination buffer + */ +void sph_shavite512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the SHAvite-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_shavite512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_simd.c b/sha3/sph_simd.c new file mode 100644 index 00000000..2c806261 --- /dev/null +++ b/sha3/sph_simd.c @@ -0,0 +1,1799 @@ +/* $Id: simd.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SIMD implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> +#include <limits.h> + +#include "sph_simd.h" + +#ifdef __cplusplus +extern "C"{ +#endif + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SIMD +#define SPH_SMALL_FOOTPRINT_SIMD 1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +typedef sph_u32 u32; +typedef sph_s32 s32; +#define C32 SPH_C32 +#define T32 SPH_T32 +#define ROL32 SPH_ROTL32 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +/* + * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
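+ * Thus alpha_tab[i] = 41^i mod 257; 41 has order 256 modulo 257 + * (in particular alpha_tab[128] = 256 = -1 mod 257), so these values + * are the twiddle factors of a 256-point NTT modulo 257. The REDS1 and + * REDS2 macros below exploit the same structure: 256 = -1 and 65536 = 1 + * modulo 257, so both partial reductions are cheap shift-and-add steps.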
+ */ +static const s32 alpha_tab[] = { + 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, + 190, 80, 196, 69, 2, 82, 21, 90, 92, 174, 195, 28, + 120, 37, 232, 3, 123, 160, 135, 138, 4, 164, 42, 180, + 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, + 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, + 235, 126, 26, 38, 16, 142, 168, 206, 222, 107, 18, 224, + 189, 39, 57, 24, 213, 252, 52, 76, 32, 27, 79, 155, + 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, + 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, + 81, 237, 208, 47, 128, 108, 59, 106, 234, 85, 144, 250, + 227, 55, 199, 192, 162, 217, 159, 94, 256, 216, 118, 212, + 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, + 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, + 134, 97, 122, 119, 253, 93, 215, 77, 73, 166, 124, 201, + 17, 183, 50, 251, 11, 194, 244, 238, 249, 186, 173, 154, + 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, + 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, + 44, 5, 205, 181, 225, 230, 178, 102, 70, 43, 221, 66, + 136, 179, 143, 209, 88, 10, 153, 105, 193, 203, 99, 204, + 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, + 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, + 95, 40, 98, 163 +}; + +/* + * Ranges: + * REDS1: from -32768..98302 to -383..383 + * REDS2: from -2^31..2^31-1 to -32768..98302 + */ +#define REDS1(x) (((x) & 0xFF) - ((x) >> 8)) +#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16)) + +/* + * If, upon entry, the values of q[] are all in the -N..N range (where + * N >= 98302) then the new values of q[] are in the -2N..2N range. + * + * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608. + */ +#define FFT_LOOP(rb, hk, as, id) do { \ + size_t u, v; \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + (hk)]; \ + q[(rb)] = m + n; \ + q[(rb) + (hk)] = m - n; \ + u = v = 0; \ + goto id; \ + for (; u < (hk); u += 4, v += 4 * (as)) { \ + s32 t; \ + m = q[(rb) + u + 0]; \ + n = q[(rb) + u + 0 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 0 * (as)]); \ + q[(rb) + u + 0] = m + t; \ + q[(rb) + u + 0 + (hk)] = m - t; \ + id: \ + m = q[(rb) + u + 1]; \ + n = q[(rb) + u + 1 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 1 * (as)]); \ + q[(rb) + u + 1] = m + t; \ + q[(rb) + u + 1 + (hk)] = m - t; \ + m = q[(rb) + u + 2]; \ + n = q[(rb) + u + 2 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 2 * (as)]); \ + q[(rb) + u + 2] = m + t; \ + q[(rb) + u + 2 + (hk)] = m - t; \ + m = q[(rb) + u + 3]; \ + n = q[(rb) + u + 3 + (hk)]; \ + t = REDS2(n * alpha_tab[v + 3 * (as)]); \ + q[(rb) + u + 3] = m + t; \ + q[(rb) + u + 3 + (hk)] = m - t; \ + } \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. 
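+ * (Indeed, 2^8 = 256 = -1 mod 257, so 2 has order 16 modulo 257.)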
Multiplication by alpha^i is then reduced + * to some shifting. + * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP(rb, 16, 8, id); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb, id) do { \ + FFT32(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, XCAT(id, b)); \ + FFT_LOOP(rb, 32, 4, id); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +static void +fft32(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT16(0, xd, 0); + FFT16(xs, xd, 16); + FFT_LOOP(0, 16, 8, label_); +} + +#define FFT128(xb, xs, rb, id) do { \ + fft32(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft32(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 32]); \ + FFT_LOOP(rb, 32, 4, XCAT(id, aa)); \ + fft32(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 64]); \ + fft32(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 96]); \ + FFT_LOOP((rb) + 64, 32, 4, XCAT(id, ab)); \ + FFT_LOOP(rb, 64, 2, XCAT(id, a)); \ + } while (0) + +#else + +/* + * Output range: |q| <= 4733784 + */ +#define FFT128(xb, xs, rb, id) do { \ + FFT64(xb, (xs) << 1, rb, XCAT(id, a)); \ + FFT64((xb) + (xs), (xs) << 1, (rb) + 64, XCAT(id, b)); \ + FFT_LOOP(rb, 64, 2, id); \ + } while (0) + +#endif + +/* + * For SIMD-384 / SIMD-512, the fully unrolled FFT yields a compression + * function which does not fit in the 32 kB L1 cache of a typical x86 + * Intel. We therefore add a function call layer at the FFT64 level. 
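+ * (Hence the fft64() helper defined below: FFT256 is built from four + * fft64() calls plus three merging FFT_LOOP passes, instead of being + * expanded inline as one huge macro.)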
+ */ + +static void +fft64(unsigned char *x, size_t xs, s32 *q) +{ + size_t xd; + + xd = xs << 1; + FFT32(0, xd, 0, label_a); + FFT32(xs, xd, 32, label_b); + FFT_LOOP(0, 32, 4, label_); +} + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + fft64(x + (xb) + ((xs) * 0), (xs) << 2, &q[(rb) + 0]); \ + fft64(x + (xb) + ((xs) * 2), (xs) << 2, &q[(rb) + 64]); \ + FFT_LOOP(rb, 64, 2, XCAT(id, aa)); \ + fft64(x + (xb) + ((xs) * 1), (xs) << 2, &q[(rb) + 128]); \ + fft64(x + (xb) + ((xs) * 3), (xs) << 2, &q[(rb) + 192]); \ + FFT_LOOP((rb) + 128, 64, 2, XCAT(id, ab)); \ + FFT_LOOP(rb, 128, 1, XCAT(id, a)); \ + } while (0) + +/* + * alpha^(127*i) mod 257 + */ +static const unsigned short yoff_s_n[] = { + 1, 98, 95, 58, 30, 113, 23, 198, 129, 49, 176, 29, + 15, 185, 140, 99, 193, 153, 88, 143, 136, 221, 70, 178, + 225, 205, 44, 200, 68, 239, 35, 89, 241, 231, 22, 100, + 34, 248, 146, 173, 249, 244, 11, 50, 17, 124, 73, 215, + 253, 122, 134, 25, 137, 62, 165, 236, 255, 61, 67, 141, + 197, 31, 211, 118, 256, 159, 162, 199, 227, 144, 234, 59, + 128, 208, 81, 228, 242, 72, 117, 158, 64, 104, 169, 114, + 121, 36, 187, 79, 32, 52, 213, 57, 189, 18, 222, 168, + 16, 26, 235, 157, 223, 9, 111, 84, 8, 13, 246, 207, + 240, 133, 184, 42, 4, 135, 123, 232, 120, 195, 92, 21, + 2, 196, 190, 116, 60, 226, 46, 139 +}; + +/* + * alpha^(127*i) + alpha^(125*i) mod 257 + */ +static const unsigned short yoff_s_f[] = { + 2, 156, 118, 107, 45, 212, 111, 162, 97, 249, 211, 3, + 49, 101, 151, 223, 189, 178, 253, 204, 76, 82, 232, 65, + 96, 176, 161, 47, 189, 61, 248, 107, 0, 131, 133, 113, + 17, 33, 12, 111, 251, 103, 57, 148, 47, 65, 249, 143, + 189, 8, 204, 230, 205, 151, 187, 227, 247, 111, 140, 6, + 77, 10, 21, 149, 255, 101, 139, 150, 212, 45, 146, 95, + 160, 8, 46, 254, 208, 156, 106, 34, 68, 79, 4, 53, + 181, 175, 25, 192, 161, 81, 96, 210, 68, 196, 9, 150, + 0, 126, 124, 144, 240, 224, 245, 146, 6, 154, 200, 109, + 210, 192, 8, 114, 68, 249, 53, 27, 52, 106, 70, 30, + 10, 146, 117, 251, 180, 247, 236, 108 +}; + +/* + * beta^(255*i) mod 257 + */ +static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +/* + * beta^(255*i) + beta^(253*i) mod 257 + */ +static const unsigned short yoff_b_f[] = { + 2, 203, 156, 47, 118, 214, 107, 106, 45, 93, 212, 20, + 111, 73, 162, 251, 97, 215, 249, 53, 211, 19, 3, 89, + 
49, 207, 101, 67, 151, 130, 223, 23, 189, 202, 178, 239, + 253, 127, 204, 49, 76, 236, 82, 137, 232, 157, 65, 79, + 96, 161, 176, 130, 161, 30, 47, 9, 189, 247, 61, 226, + 248, 90, 107, 64, 0, 88, 131, 243, 133, 59, 113, 115, + 17, 236, 33, 213, 12, 191, 111, 19, 251, 61, 103, 208, + 57, 35, 148, 248, 47, 116, 65, 119, 249, 178, 143, 40, + 189, 129, 8, 163, 204, 227, 230, 196, 205, 122, 151, 45, + 187, 19, 227, 72, 247, 125, 111, 121, 140, 220, 6, 107, + 77, 69, 10, 101, 21, 65, 149, 171, 255, 54, 101, 210, + 139, 43, 150, 151, 212, 164, 45, 237, 146, 184, 95, 6, + 160, 42, 8, 204, 46, 238, 254, 168, 208, 50, 156, 190, + 106, 127, 34, 234, 68, 55, 79, 18, 4, 130, 53, 208, + 181, 21, 175, 120, 25, 100, 192, 178, 161, 96, 81, 127, + 96, 227, 210, 248, 68, 10, 196, 31, 9, 167, 150, 193, + 0, 169, 126, 14, 124, 198, 144, 142, 240, 21, 224, 44, + 245, 66, 146, 238, 6, 196, 154, 49, 200, 222, 109, 9, + 210, 141, 192, 138, 8, 79, 114, 217, 68, 128, 249, 94, + 53, 30, 27, 61, 52, 135, 106, 212, 70, 238, 30, 185, + 10, 132, 146, 136, 117, 37, 251, 150, 180, 188, 247, 156, + 236, 192, 108, 86 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_SMALL(sb, o1, o2, mm) \ + (INNER(q[8 * (sb) + 2 * 0 + o1], q[8 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 1 + o1], q[8 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 2 + o1], q[8 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[8 * (sb) + 2 * 3 + o1], q[8 * (sb) + 2 * 3 + o2], mm) + +#define WS_0_0 W_SMALL( 4, 0, 1, 185) +#define WS_0_1 W_SMALL( 6, 0, 1, 185) +#define WS_0_2 W_SMALL( 0, 0, 1, 185) +#define WS_0_3 W_SMALL( 2, 0, 1, 185) +#define WS_0_4 W_SMALL( 7, 0, 1, 185) +#define WS_0_5 W_SMALL( 5, 0, 1, 185) +#define WS_0_6 W_SMALL( 3, 0, 1, 185) +#define WS_0_7 W_SMALL( 1, 0, 1, 185) +#define WS_1_0 W_SMALL(15, 0, 1, 185) +#define WS_1_1 W_SMALL(11, 0, 1, 185) +#define WS_1_2 W_SMALL(12, 0, 1, 185) +#define WS_1_3 W_SMALL( 8, 0, 1, 185) +#define WS_1_4 W_SMALL( 9, 0, 1, 185) +#define WS_1_5 W_SMALL(13, 0, 1, 185) +#define WS_1_6 W_SMALL(10, 0, 1, 185) +#define WS_1_7 W_SMALL(14, 0, 1, 185) +#define WS_2_0 W_SMALL(17, -128, -64, 233) +#define WS_2_1 W_SMALL(18, -128, -64, 233) +#define WS_2_2 W_SMALL(23, -128, -64, 233) +#define WS_2_3 W_SMALL(20, -128, -64, 233) +#define WS_2_4 W_SMALL(22, -128, -64, 233) +#define WS_2_5 W_SMALL(21, -128, -64, 233) +#define WS_2_6 W_SMALL(16, -128, -64, 233) +#define WS_2_7 W_SMALL(19, -128, -64, 233) +#define WS_3_0 W_SMALL(30, -191, -127, 233) +#define WS_3_1 W_SMALL(24, -191, -127, 233) +#define WS_3_2 W_SMALL(25, -191, -127, 233) +#define WS_3_3 W_SMALL(31, -191, -127, 233) +#define WS_3_4 W_SMALL(27, -191, -127, 233) +#define WS_3_5 W_SMALL(29, -191, -127, 233) +#define WS_3_6 W_SMALL(28, -191, -127, 233) +#define WS_3_7 W_SMALL(26, -191, -127, 233) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 
W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#if SPH_SIMD_NOCOPY + +#define DECL_STATE_SMALL +#define READ_STATE_SMALL(sc) +#define WRITE_STATE_SMALL(sc) +#define DECL_STATE_BIG +#define READ_STATE_BIG(sc) +#define WRITE_STATE_BIG(sc) + +#else + +#define DECL_STATE_SMALL \ + u32 A0, A1, A2, A3, B0, B1, B2, B3, C0, C1, C2, C3, D0, D1, D2, D3; + +#define READ_STATE_SMALL(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + B0 = (sc)->state[ 4]; \ + B1 = (sc)->state[ 5]; \ + B2 = (sc)->state[ 6]; \ + B3 = (sc)->state[ 7]; \ + C0 = (sc)->state[ 8]; \ + C1 = (sc)->state[ 9]; \ + C2 = (sc)->state[10]; \ + C3 = (sc)->state[11]; \ + D0 = (sc)->state[12]; \ + D1 = (sc)->state[13]; \ + D2 = (sc)->state[14]; \ + D3 = (sc)->state[15]; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = B0; \ + (sc)->state[ 5] = B1; \ + (sc)->state[ 6] 
= B2; \ + (sc)->state[ 7] = B3; \ + (sc)->state[ 8] = C0; \ + (sc)->state[ 9] = C1; \ + (sc)->state[10] = C2; \ + (sc)->state[11] = C3; \ + (sc)->state[12] = D0; \ + (sc)->state[13] = D1; \ + (sc)->state[14] = D2; \ + (sc)->state[15] = D3; \ + } while (0) + +#define DECL_STATE_BIG \ + u32 A0, A1, A2, A3, A4, A5, A6, A7; \ + u32 B0, B1, B2, B3, B4, B5, B6, B7; \ + u32 C0, C1, C2, C3, C4, C5, C6, C7; \ + u32 D0, D1, D2, D3, D4, D5, D6, D7; + +#define READ_STATE_BIG(sc) do { \ + A0 = (sc)->state[ 0]; \ + A1 = (sc)->state[ 1]; \ + A2 = (sc)->state[ 2]; \ + A3 = (sc)->state[ 3]; \ + A4 = (sc)->state[ 4]; \ + A5 = (sc)->state[ 5]; \ + A6 = (sc)->state[ 6]; \ + A7 = (sc)->state[ 7]; \ + B0 = (sc)->state[ 8]; \ + B1 = (sc)->state[ 9]; \ + B2 = (sc)->state[10]; \ + B3 = (sc)->state[11]; \ + B4 = (sc)->state[12]; \ + B5 = (sc)->state[13]; \ + B6 = (sc)->state[14]; \ + B7 = (sc)->state[15]; \ + C0 = (sc)->state[16]; \ + C1 = (sc)->state[17]; \ + C2 = (sc)->state[18]; \ + C3 = (sc)->state[19]; \ + C4 = (sc)->state[20]; \ + C5 = (sc)->state[21]; \ + C6 = (sc)->state[22]; \ + C7 = (sc)->state[23]; \ + D0 = (sc)->state[24]; \ + D1 = (sc)->state[25]; \ + D2 = (sc)->state[26]; \ + D3 = (sc)->state[27]; \ + D4 = (sc)->state[28]; \ + D5 = (sc)->state[29]; \ + D6 = (sc)->state[30]; \ + D7 = (sc)->state[31]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->state[ 0] = A0; \ + (sc)->state[ 1] = A1; \ + (sc)->state[ 2] = A2; \ + (sc)->state[ 3] = A3; \ + (sc)->state[ 4] = A4; \ + (sc)->state[ 5] = A5; \ + (sc)->state[ 6] = A6; \ + (sc)->state[ 7] = A7; \ + (sc)->state[ 8] = B0; \ + (sc)->state[ 9] = B1; \ + (sc)->state[10] = B2; \ + (sc)->state[11] = B3; \ + (sc)->state[12] = B4; \ + (sc)->state[13] = B5; \ + (sc)->state[14] = B6; \ + (sc)->state[15] = B7; \ + (sc)->state[16] = C0; \ + (sc)->state[17] = C1; \ + (sc)->state[18] = C2; \ + (sc)->state[19] = C3; \ + (sc)->state[20] = C4; \ + (sc)->state[21] = C5; \ + (sc)->state[22] = C6; \ + (sc)->state[23] = C7; \ + (sc)->state[24] = D0; \ + (sc)->state[25] = D1; \ + (sc)->state[26] = D2; \ + (sc)->state[27] = D3; \ + (sc)->state[28] = D4; \ + (sc)->state[29] = D5; \ + (sc)->state[30] = D6; \ + (sc)->state[31] = D7; \ + } while (0) + +#endif + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + STEP_ELT(0, w0, fun, s, pp4b); \ + STEP_ELT(1, w1, fun, s, pp4b); \ + STEP_ELT(2, w2, fun, s, pp4b); \ + STEP_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define M3_0_0 0_ +#define M3_1_0 1_ +#define M3_2_0 2_ +#define M3_3_0 0_ +#define M3_4_0 1_ +#define M3_5_0 2_ +#define M3_6_0 0_ +#define M3_7_0 1_ + +#define 
M3_0_1 1_ +#define M3_1_1 2_ +#define M3_2_1 0_ +#define M3_3_1 1_ +#define M3_4_1 2_ +#define M3_5_1 0_ +#define M3_6_1 1_ +#define M3_7_1 2_ + +#define M3_0_2 2_ +#define M3_1_2 0_ +#define M3_2_2 1_ +#define M3_3_2 2_ +#define M3_4_2 0_ +#define M3_5_2 1_ +#define M3_6_2 2_ +#define M3_7_2 0_ + +#define STEP_SMALL_(w, fun, r, s, pp4b) STEP_SMALL w, fun, r, s, pp4b) + +#define ONE_ROUND_SMALL(ri, isp, p0, p1, p2, p3) do { \ + STEP_SMALL_(WS_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP4_, M3_0_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP4_, M3_1_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP4_, M3_2_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP4_, M3_3_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP4_, M3_4_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP4_, M3_5_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP4_, M3_6_ ## isp)); \ + STEP_SMALL_(WS_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP4_, M3_7_ ## isp)); \ + } while (0) + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define B0 state[ 4] +#define B1 state[ 5] +#define B2 state[ 6] +#define B3 state[ 7] +#define C0 state[ 8] +#define C1 state[ 9] +#define C2 state[10] +#define C3 state[11] +#define D0 state[12] +#define D1 state[13] +#define D2 state[14] +#define D3 state[15] + +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + +#define STEP2_SMALL(w0, w1, w2, w3, fun, r, s, pp4b) do { \ + u32 tA[4]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + STEP2_ELT(0, w0, fun, s, pp4b); \ + STEP2_ELT(1, w1, fun, s, pp4b); \ + STEP2_ELT(2, w2, fun, s, pp4b); \ + STEP2_ELT(3, w3, fun, s, pp4b); \ + } while (0) + +static void +one_round_small(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp4k[] = { 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2 }; + + STEP2_SMALL(w[ 0], w[ 1], w[ 2], w[ 
3], IF, p0, p1, pp4k[isp + 0]); + STEP2_SMALL(w[ 4], w[ 5], w[ 6], w[ 7], IF, p1, p2, pp4k[isp + 1]); + STEP2_SMALL(w[ 8], w[ 9], w[10], w[11], IF, p2, p3, pp4k[isp + 2]); + STEP2_SMALL(w[12], w[13], w[14], w[15], IF, p3, p0, pp4k[isp + 3]); + STEP2_SMALL(w[16], w[17], w[18], w[19], MAJ, p0, p1, pp4k[isp + 4]); + STEP2_SMALL(w[20], w[21], w[22], w[23], MAJ, p1, p2, pp4k[isp + 5]); + STEP2_SMALL(w[24], w[25], w[26], w[27], MAJ, p2, p3, pp4k[isp + 6]); + STEP2_SMALL(w[28], w[29], w[30], w[31], MAJ, p3, p0, pp4k[isp + 7]); +} + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + u32 w[32]; + u32 state[16]; + size_t u; + + static const size_t wsp[32] = { + 4 << 3, 6 << 3, 0 << 3, 2 << 3, + 7 << 3, 5 << 3, 3 << 3, 1 << 3, + 15 << 3, 11 << 3, 12 << 3, 8 << 3, + 9 << 3, 13 << 3, 10 << 3, 14 << 3, + 17 << 3, 18 << 3, 23 << 3, 20 << 3, + 22 << 3, 21 << 3, 16 << 3, 19 << 3, + 30 << 3, 24 << 3, 25 << 3, 31 << 3, + 27 << 3, 29 << 3, 28 << 3, 26 << 3 + }; + + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + + for (i = 0; i < 16; i += 4) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + } + +#define WSREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 32; u += 4) { \ + size_t v = wsp[(u >> 2) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + } \ + } while (0) + + WSREAD( 0, 0, 1, 185); + one_round_small(state, w, 0, 3, 23, 17, 27); + WSREAD( 8, 0, 1, 185); + one_round_small(state, w, 2, 28, 19, 22, 7); + WSREAD(16, -128, -64, 233); + one_round_small(state, w, 1, 29, 9, 15, 5); + WSREAD(24, -191, -127, 233); + one_round_small(state, w, 0, 4, 13, 10, 25); + +#undef WSREAD + + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define B0 (sc->state[ 4]) +#define B1 (sc->state[ 5]) +#define B2 (sc->state[ 6]) +#define B3 (sc->state[ 7]) +#define C0 (sc->state[ 8]) +#define C1 (sc->state[ 9]) +#define C2 (sc->state[10]) +#define C3 (sc->state[11]) +#define D0 (sc->state[12]) +#define D1 (sc->state[13]) +#define D2 
(sc->state[14]) +#define D3 (sc->state[15]) +#endif + +static void +compress_small(sph_simd_small_context *sc, int last) +{ + unsigned char *x; + s32 q[128]; + int i; + DECL_STATE_SMALL +#if SPH_SIMD_NOCOPY + sph_u32 saved[16]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + x = sc->buf; + FFT128(0, 1, 0, ll); + if (last) { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 128; i ++) { + s32 tq; + + tq = q[i] + yoff_s_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_SMALL(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + B0 ^= sph_dec32le_aligned(x + 16); + B1 ^= sph_dec32le_aligned(x + 20); + B2 ^= sph_dec32le_aligned(x + 24); + B3 ^= sph_dec32le_aligned(x + 28); + C0 ^= sph_dec32le_aligned(x + 32); + C1 ^= sph_dec32le_aligned(x + 36); + C2 ^= sph_dec32le_aligned(x + 40); + C3 ^= sph_dec32le_aligned(x + 44); + D0 ^= sph_dec32le_aligned(x + 48); + D1 ^= sph_dec32le_aligned(x + 52); + D2 ^= sph_dec32le_aligned(x + 56); + D3 ^= sph_dec32le_aligned(x + 60); + ONE_ROUND_SMALL(0_, 0, 3, 23, 17, 27); + ONE_ROUND_SMALL(1_, 2, 28, 19, 22, 7); + ONE_ROUND_SMALL(2_, 1, 29, 9, 15, 5); + ONE_ROUND_SMALL(3_, 0, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_SMALL(saved[ 0], saved[ 1], saved[ 2], saved[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(saved[ 8], saved[ 9], saved[10], saved[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(saved[12], saved[13], saved[14], saved[15], + IF, 25, 4, PP4_2_); +#else + STEP_SMALL(sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + IF, 4, 13, PP4_2_); + STEP_SMALL(sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 13, 10, PP4_0_); + STEP_SMALL(sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + IF, 10, 25, PP4_1_); + STEP_SMALL(sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 25, 4, PP4_2_); + WRITE_STATE_SMALL(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#endif + +#endif + +#if SPH_SMALL_FOOTPRINT_SIMD + +#define A0 state[ 0] +#define A1 state[ 1] +#define A2 state[ 2] +#define A3 state[ 3] +#define A4 state[ 4] +#define A5 state[ 5] +#define A6 state[ 6] +#define A7 state[ 7] +#define B0 state[ 8] +#define B1 state[ 9] +#define B2 state[10] +#define B3 state[11] +#define B4 state[12] +#define B5 state[13] +#define B6 state[14] +#define B7 state[15] +#define C0 state[16] +#define C1 state[17] +#define C2 state[18] +#define C3 state[19] +#define C4 state[20] +#define C5 state[21] +#define C6 state[22] +#define C7 state[23] +#define D0 state[24] +#define D1 state[25] +#define D2 state[26] +#define D3 state[27] +#define D4 state[28] +#define D5 state[29] +#define D6 state[30] +#define D7 state[31] + +/* + * Not needed -- already defined for SIMD-224 / SIMD-256 + * +#define STEP2_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + tA[(ppb) ^ n]); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA[n]; \ + } while (0) + */ + +#define STEP2_BIG(w0, w1, w2, w3, w4, w5, 
w6, w7, fun, r, s, pp8b) do { \ + u32 tA[8]; \ + tA[0] = ROL32(A0, r); \ + tA[1] = ROL32(A1, r); \ + tA[2] = ROL32(A2, r); \ + tA[3] = ROL32(A3, r); \ + tA[4] = ROL32(A4, r); \ + tA[5] = ROL32(A5, r); \ + tA[6] = ROL32(A6, r); \ + tA[7] = ROL32(A7, r); \ + STEP2_ELT(0, w0, fun, s, pp8b); \ + STEP2_ELT(1, w1, fun, s, pp8b); \ + STEP2_ELT(2, w2, fun, s, pp8b); \ + STEP2_ELT(3, w3, fun, s, pp8b); \ + STEP2_ELT(4, w4, fun, s, pp8b); \ + STEP2_ELT(5, w5, fun, s, pp8b); \ + STEP2_ELT(6, w6, fun, s, pp8b); \ + STEP2_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +static void +one_round_big(u32 *state, u32 *w, int isp, int p0, int p1, int p2, int p3) +{ + static const int pp8k[] = { 1, 6, 2, 3, 5, 7, 4, 1, 6, 2, 3 }; + + STEP2_BIG(w[ 0], w[ 1], w[ 2], w[ 3], w[ 4], w[ 5], w[ 6], w[ 7], + IF, p0, p1, pp8k[isp + 0]); + STEP2_BIG(w[ 8], w[ 9], w[10], w[11], w[12], w[13], w[14], w[15], + IF, p1, p2, pp8k[isp + 1]); + STEP2_BIG(w[16], w[17], w[18], w[19], w[20], w[21], w[22], w[23], + IF, p2, p3, pp8k[isp + 2]); + STEP2_BIG(w[24], w[25], w[26], w[27], w[28], w[29], w[30], w[31], + IF, p3, p0, pp8k[isp + 3]); + STEP2_BIG(w[32], w[33], w[34], w[35], w[36], w[37], w[38], w[39], + MAJ, p0, p1, pp8k[isp + 4]); + STEP2_BIG(w[40], w[41], w[42], w[43], w[44], w[45], w[46], w[47], + MAJ, p1, p2, pp8k[isp + 5]); + STEP2_BIG(w[48], w[49], w[50], w[51], w[52], w[53], w[54], w[55], + MAJ, p2, p3, pp8k[isp + 6]); + STEP2_BIG(w[56], w[57], w[58], w[59], w[60], w[61], w[62], w[63], + MAJ, p3, p0, pp8k[isp + 7]); +} + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + u32 w[64]; + u32 state[32]; + size_t u; + + static const size_t wbp[32] = { + 4 << 4, 6 << 4, 0 << 4, 2 << 4, + 7 << 4, 5 << 4, 3 << 4, 1 << 4, + 15 << 4, 11 << 4, 12 << 4, 8 << 4, + 9 << 4, 13 << 4, 10 << 4, 14 << 4, + 17 << 4, 18 << 4, 23 << 4, 20 << 4, + 22 << 4, 21 << 4, 16 << 4, 19 << 4, + 30 << 4, 24 << 4, 25 << 4, 31 << 4, + 27 << 4, 29 << 4, 28 << 4, 26 << 4 + }; + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + } + + for (i = 0; i < 32; i += 8) { + state[i + 0] = sc->state[i + 0] + ^ sph_dec32le_aligned(x + 4 * (i + 0)); + state[i + 1] = sc->state[i + 1] + ^ sph_dec32le_aligned(x + 4 * (i + 1)); + state[i + 2] = sc->state[i + 2] + ^ sph_dec32le_aligned(x + 4 * (i + 2)); + state[i + 3] = sc->state[i + 3] + ^ sph_dec32le_aligned(x + 4 * (i + 3)); + state[i + 4] = sc->state[i + 4] + ^ sph_dec32le_aligned(x + 4 * (i + 4)); + state[i + 5] = sc->state[i + 5] + ^ sph_dec32le_aligned(x + 4 * (i + 5)); + state[i + 6] = sc->state[i + 6] + ^ sph_dec32le_aligned(x + 4 * (i + 6)); + state[i + 7] = sc->state[i + 7] + ^ sph_dec32le_aligned(x + 4 * (i + 7)); + } + +#define WBREAD(sb, o1, o2, mm) do { \ + for (u = 0; u < 64; u += 8) { \ + size_t v = wbp[(u >> 3) + (sb)]; \ + w[u + 0] = INNER(q[v + 2 * 0 + (o1)], \ + q[v + 2 * 0 + (o2)], mm); \ + w[u + 1] = INNER(q[v + 2 * 1 + (o1)], \ + q[v + 2 * 1 + (o2)], mm); \ + w[u + 2] = INNER(q[v + 2 * 2 + (o1)], \ + q[v + 2 * 2 + (o2)], mm); \ + w[u + 3] = INNER(q[v + 2 * 3 + (o1)], \ + q[v + 2 * 3 + (o2)], mm); \ + w[u + 4] = INNER(q[v + 2 * 4 + (o1)], \ + q[v + 2 * 4 + (o2)], mm); \ + w[u + 5] = INNER(q[v + 2 * 5 + (o1)], \ + q[v + 2 * 5 + (o2)], mm); \ + w[u + 6] = INNER(q[v + 2 * 6 + (o1)], \ + q[v + 2 * 6 + (o2)], mm); \ + w[u + 7] = INNER(q[v + 2 * 7 + (o1)], \ + q[v + 2 * 7 + (o2)], mm); \ + } \ + } while (0) + + WBREAD( 0, 0, 1, 185); + one_round_big(state, w, 0, 3, 23, 17, 27); + WBREAD( 8, 0, 1, 185); + one_round_big(state, w, 1, 28, 19, 22, 7); + WBREAD(16, -256, -128, 233); + one_round_big(state, w, 2, 29, 9, 15, 5); + WBREAD(24, -383, -255, 233); + one_round_big(state, w, 3, 4, 13, 10, 25); + +#undef WBREAD + + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + + memcpy(sc->state, state, sizeof state); +} + +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 +#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 + +#else + +#if SPH_SIMD_NOCOPY +#define A0 (sc->state[ 0]) +#define A1 (sc->state[ 1]) +#define A2 (sc->state[ 2]) +#define A3 (sc->state[ 3]) +#define A4 (sc->state[ 4]) +#define A5 (sc->state[ 5]) +#define A6 (sc->state[ 6]) +#define A7 (sc->state[ 7]) +#define B0 (sc->state[ 8]) +#define B1 (sc->state[ 9]) +#define B2 (sc->state[10]) +#define B3 (sc->state[11]) +#define B4 (sc->state[12]) +#define B5 (sc->state[13]) +#define B6 (sc->state[14]) +#define B7 (sc->state[15]) +#define C0 (sc->state[16]) +#define C1 (sc->state[17]) +#define C2 (sc->state[18]) +#define C3 (sc->state[19]) +#define C4 (sc->state[20]) +#define C5 (sc->state[21]) +#define C6 (sc->state[22]) +#define C7 (sc->state[23]) +#define D0 (sc->state[24]) +#define D1 (sc->state[25]) +#define D2 (sc->state[26]) +#define D3 (sc->state[27]) +#define D4 
(sc->state[28]) +#define D5 (sc->state[29]) +#define D6 (sc->state[30]) +#define D7 (sc->state[31]) +#endif + +static void +compress_big(sph_simd_big_context *sc, int last) +{ + unsigned char *x; + s32 q[256]; + int i; + DECL_STATE_BIG +#if SPH_SIMD_NOCOPY + sph_u32 saved[32]; +#endif + +#if SPH_SIMD_NOCOPY + memcpy(saved, sc->state, sizeof saved); +#endif + + x = sc->buf; + FFT256(0, 1, 0, ll); + if (last) { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_f[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } else { + for (i = 0; i < 256; i ++) { + s32 tq; + + tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? tq : tq - 257); + } + } + READ_STATE_BIG(sc); + A0 ^= sph_dec32le_aligned(x + 0); + A1 ^= sph_dec32le_aligned(x + 4); + A2 ^= sph_dec32le_aligned(x + 8); + A3 ^= sph_dec32le_aligned(x + 12); + A4 ^= sph_dec32le_aligned(x + 16); + A5 ^= sph_dec32le_aligned(x + 20); + A6 ^= sph_dec32le_aligned(x + 24); + A7 ^= sph_dec32le_aligned(x + 28); + B0 ^= sph_dec32le_aligned(x + 32); + B1 ^= sph_dec32le_aligned(x + 36); + B2 ^= sph_dec32le_aligned(x + 40); + B3 ^= sph_dec32le_aligned(x + 44); + B4 ^= sph_dec32le_aligned(x + 48); + B5 ^= sph_dec32le_aligned(x + 52); + B6 ^= sph_dec32le_aligned(x + 56); + B7 ^= sph_dec32le_aligned(x + 60); + C0 ^= sph_dec32le_aligned(x + 64); + C1 ^= sph_dec32le_aligned(x + 68); + C2 ^= sph_dec32le_aligned(x + 72); + C3 ^= sph_dec32le_aligned(x + 76); + C4 ^= sph_dec32le_aligned(x + 80); + C5 ^= sph_dec32le_aligned(x + 84); + C6 ^= sph_dec32le_aligned(x + 88); + C7 ^= sph_dec32le_aligned(x + 92); + D0 ^= sph_dec32le_aligned(x + 96); + D1 ^= sph_dec32le_aligned(x + 100); + D2 ^= sph_dec32le_aligned(x + 104); + D3 ^= sph_dec32le_aligned(x + 108); + D4 ^= sph_dec32le_aligned(x + 112); + D5 ^= sph_dec32le_aligned(x + 116); + D6 ^= sph_dec32le_aligned(x + 120); + D7 ^= sph_dec32le_aligned(x + 124); + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); +#if SPH_SIMD_NOCOPY + STEP_BIG( + saved[ 0], saved[ 1], saved[ 2], saved[ 3], + saved[ 4], saved[ 5], saved[ 6], saved[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + saved[ 8], saved[ 9], saved[10], saved[11], + saved[12], saved[13], saved[14], saved[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + saved[16], saved[17], saved[18], saved[19], + saved[20], saved[21], saved[22], saved[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + saved[24], saved[25], saved[26], saved[27], + saved[28], saved[29], saved[30], saved[31], + IF, 25, 4, PP8_0_); +#else + STEP_BIG( + sc->state[ 0], sc->state[ 1], sc->state[ 2], sc->state[ 3], + sc->state[ 4], sc->state[ 5], sc->state[ 6], sc->state[ 7], + IF, 4, 13, PP8_4_); + STEP_BIG( + sc->state[ 8], sc->state[ 9], sc->state[10], sc->state[11], + sc->state[12], sc->state[13], sc->state[14], sc->state[15], + IF, 13, 10, PP8_5_); + STEP_BIG( + sc->state[16], sc->state[17], sc->state[18], sc->state[19], + sc->state[20], sc->state[21], sc->state[22], sc->state[23], + IF, 10, 25, PP8_6_); + STEP_BIG( + sc->state[24], sc->state[25], sc->state[26], sc->state[27], + sc->state[28], sc->state[29], sc->state[30], sc->state[31], + IF, 25, 4, PP8_0_); + WRITE_STATE_BIG(sc); +#endif +} + +#if SPH_SIMD_NOCOPY +#undef A0 +#undef A1 +#undef A2 +#undef A3 +#undef A4 +#undef A5 +#undef A6 +#undef A7 +#undef B0 +#undef B1 +#undef B2 +#undef B3 +#undef B4 +#undef B5 +#undef B6 +#undef B7 +#undef C0 +#undef C1 
+#undef C2 +#undef C3 +#undef C4 +#undef C5 +#undef C6 +#undef C7 +#undef D0 +#undef D1 +#undef D2 +#undef D3 +#undef D4 +#undef D5 +#undef D6 +#undef D7 +#endif + +#endif + +static const u32 IV224[] = { + C32(0x33586E9F), C32(0x12FFF033), C32(0xB2D9F64D), C32(0x6F8FEA53), + C32(0xDE943106), C32(0x2742E439), C32(0x4FBAB5AC), C32(0x62B9FF96), + C32(0x22E7B0AF), C32(0xC862B3A8), C32(0x33E00CDC), C32(0x236B86A6), + C32(0xF64AE77C), C32(0xFA373B76), C32(0x7DC1EE5B), C32(0x7FB29CE8) +}; + +static const u32 IV256[] = { + C32(0x4D567983), C32(0x07190BA9), C32(0x8474577B), C32(0x39D726E9), + C32(0xAAF3D925), C32(0x3EE20B03), C32(0xAFD5E751), C32(0xC96006D3), + C32(0xC2C2BA14), C32(0x49B3BCB4), C32(0xF67CAF46), C32(0x668626C9), + C32(0xE2EAA8D2), C32(0x1FF47833), C32(0xD0C661A5), C32(0x55693DE1) +}; + +static const u32 IV384[] = { + C32(0x8A36EEBC), C32(0x94A3BD90), C32(0xD1537B83), C32(0xB25B070B), + C32(0xF463F1B5), C32(0xB6F81E20), C32(0x0055C339), C32(0xB4D144D1), + C32(0x7360CA61), C32(0x18361A03), C32(0x17DCB4B9), C32(0x3414C45A), + C32(0xA699A9D2), C32(0xE39E9664), C32(0x468BFE77), C32(0x51D062F8), + C32(0xB9E3BFE8), C32(0x63BECE2A), C32(0x8FE506B9), C32(0xF8CC4AC2), + C32(0x7AE11542), C32(0xB1AADDA1), C32(0x64B06794), C32(0x28D2F462), + C32(0xE64071EC), C32(0x1DEB91A8), C32(0x8AC8DB23), C32(0x3F782AB5), + C32(0x039B5CB8), C32(0x71DDD962), C32(0xFADE2CEA), C32(0x1416DF71) +}; + +static const u32 IV512[] = { + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22) +}; + +static void +init_small(void *cc, const u32 *iv) +{ + sph_simd_small_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +init_big(void *cc, const u32 *iv) +{ + sph_simd_big_context *sc; + + sc = cc; + memcpy(sc->state, iv, sizeof sc->state); + sc->count_low = sc->count_high = 0; + sc->ptr = 0; +} + +static void +update_small(void *cc, const void *data, size_t len) +{ + sph_simd_small_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_small(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +update_big(void *cc, const void *data, size_t len) +{ + sph_simd_big_context *sc; + + sc = cc; + while (len > 0) { + size_t clen; + + clen = (sizeof sc->buf) - sc->ptr; + if (clen > len) + clen = len; + memcpy(sc->buf + sc->ptr, data, clen); + data = (const unsigned char *)data + clen; + len -= clen; + if ((sc->ptr += clen) == sizeof sc->buf) { + compress_big(sc, 0); + sc->ptr = 0; + sc->count_low = T32(sc->count_low + 1); + if (sc->count_low == 0) + sc->count_high ++; + } + } +} + +static void +encode_count_small(unsigned char *dst, + u32 low, u32 high, size_t ptr, unsigned n) +{ + low = T32(low << 9); + high = T32(high << 9) + (low >> 23); 
+	low += (ptr << 3) + n; +	sph_enc32le(dst, low); +	sph_enc32le(dst + 4, high); +} + +static void +encode_count_big(unsigned char *dst, +	u32 low, u32 high, size_t ptr, unsigned n) +{ +	low = T32(low << 10); +	high = T32(high << 10) + (low >> 22); +	low += (ptr << 3) + n; +	sph_enc32le(dst, low); +	sph_enc32le(dst + 4, high); +} + +static void +finalize_small(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ +	sph_simd_small_context *sc; +	unsigned char *d; +	size_t u; + +	sc = cc; +	if (sc->ptr > 0 || n > 0) { +		memset(sc->buf + sc->ptr, 0, +			(sizeof sc->buf) - sc->ptr); +		sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); +		compress_small(sc, 0); +	} +	memset(sc->buf, 0, sizeof sc->buf); +	encode_count_small(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); +	compress_small(sc, 1); +	for (d = dst, u = 0; u < dst_len; u ++) +		sph_enc32le(d + (u << 2), sc->state[u]); +} + +static void +finalize_big(void *cc, unsigned ub, unsigned n, void *dst, size_t dst_len) +{ +	sph_simd_big_context *sc; +	unsigned char *d; +	size_t u; + +	sc = cc; +	if (sc->ptr > 0 || n > 0) { +		memset(sc->buf + sc->ptr, 0, +			(sizeof sc->buf) - sc->ptr); +		sc->buf[sc->ptr] = ub & (0xFF << (8 - n)); +		compress_big(sc, 0); +	} +	memset(sc->buf, 0, sizeof sc->buf); +	encode_count_big(sc->buf, sc->count_low, sc->count_high, sc->ptr, n); +	compress_big(sc, 1); +	for (d = dst, u = 0; u < dst_len; u ++) +		sph_enc32le(d + (u << 2), sc->state[u]); +} + +void +sph_simd224_init(void *cc) +{ +	init_small(cc, IV224); +} + +void +sph_simd224(void *cc, const void *data, size_t len) +{ +	update_small(cc, data, len); +} + +void +sph_simd224_close(void *cc, void *dst) +{ +	sph_simd224_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ +	finalize_small(cc, ub, n, dst, 7); +	sph_simd224_init(cc); +} + +void +sph_simd256_init(void *cc) +{ +	init_small(cc, IV256); +} + +void +sph_simd256(void *cc, const void *data, size_t len) +{ +	update_small(cc, data, len); +} + +void +sph_simd256_close(void *cc, void *dst) +{ +	sph_simd256_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ +	finalize_small(cc, ub, n, dst, 8); +	sph_simd256_init(cc); +} + +void +sph_simd384_init(void *cc) +{ +	init_big(cc, IV384); +} + +void +sph_simd384(void *cc, const void *data, size_t len) +{ +	update_big(cc, data, len); +} + +void +sph_simd384_close(void *cc, void *dst) +{ +	sph_simd384_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ +	finalize_big(cc, ub, n, dst, 12); +	sph_simd384_init(cc); +} + +void +sph_simd512_init(void *cc) +{ +	init_big(cc, IV512); +} + +void +sph_simd512(void *cc, const void *data, size_t len) +{ +	update_big(cc, data, len); +} + +void +sph_simd512_close(void *cc, void *dst) +{ +	sph_simd512_addbits_and_close(cc, 0, 0, dst); +} + +void +sph_simd512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ +	finalize_big(cc, ub, n, dst, 16); +	sph_simd512_init(cc); +} +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/sha3/sph_simd.h b/sha3/sph_simd.h new file mode 100644 index 00000000..92ee1e72 --- /dev/null +++ b/sha3/sph_simd.h @@ -0,0 +1,309 @@ +/* $Id: sph_simd.h 154 2010-04-26 17:00:24Z tp $ */ +/** + * SIMD interface.
SIMD is a family of functions which differ by + * their output size; this implementation defines SIMD for output + * sizes 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010  Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file     sph_simd.h + * @author   Thomas Pornin + */ + +#ifndef SPH_SIMD_H__ +#define SPH_SIMD_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for SIMD-224. + */ +#define SPH_SIZE_simd224   224 + +/** + * Output size (in bits) for SIMD-256. + */ +#define SPH_SIZE_simd256   256 + +/** + * Output size (in bits) for SIMD-384. + */ +#define SPH_SIZE_simd384   384 + +/** + * Output size (in bits) for SIMD-512. + */ +#define SPH_SIZE_simd512   512 + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-224 + * and SIMD-256. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE +	unsigned char buf[64];    /* first field, for alignment */ +	size_t ptr; +	sph_u32 state[16]; +	sph_u32 count_low, count_high; +#endif +} sph_simd_small_context; + +/** + * This structure is a context for SIMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * an SIMD computation has been performed, the context can be reused for + * another computation. This specific structure is used for SIMD-384 + * and SIMD-512. + * + * The contents of this structure are private. A running SIMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE +	unsigned char buf[128];    /* first field, for alignment */ +	size_t ptr; +	sph_u32 state[32]; +	sph_u32 count_low, count_high; +#endif +} sph_simd_big_context; + +/** + * Type for a SIMD-224 context (identical to the common "small" context). + */ +typedef sph_simd_small_context sph_simd224_context; + +/** + * Type for a SIMD-256 context (identical to the common "small" context).
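+ * + * As an illustrative sketch (this example is not part of the original + * sphlib documentation; msg and msg_len are placeholder names for the + * caller's input buffer and its byte length), a one-shot SIMD-256 + * computation chains the init, data and close calls declared below: + * + *   sph_simd256_context cc; + *   unsigned char digest[32]; + *   sph_simd256_init(&cc); + *   sph_simd256(&cc, msg, msg_len); + *   sph_simd256_close(&cc, digest); + * + * After the close call, digest holds the 32-byte output and the context + * has been reinitialized for a new computation.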
+ */ +typedef sph_simd_small_context sph_simd256_context; + +/** + * Type for a SIMD-384 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd384_context; + +/** + * Type for a SIMD-512 context (identical to the common "big" context). + */ +typedef sph_simd_big_context sph_simd512_context; + +/** + * Initialize an SIMD-224 context. This process performs no memory allocation. + * + * @param cc   the SIMD-224 context (pointer to a + *             sph_simd224_context) + */ +void sph_simd224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the SIMD-224 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_simd224(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc    the SIMD-224 context + * @param dst   the destination buffer + */ +void sph_simd224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the SIMD-224 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_simd224_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-256 context. This process performs no memory allocation. + * + * @param cc   the SIMD-256 context (pointer to a + *             sph_simd256_context) + */ +void sph_simd256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the SIMD-256 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_simd256(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc    the SIMD-256 context + * @param dst   the destination buffer + */ +void sph_simd256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the SIMD-256 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_simd256_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-384 context. This process performs no memory allocation.
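+ * The context may therefore be an ordinary local (stack) variable; only + * the structure designated by cc is written to.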
+ * + * @param cc   the SIMD-384 context (pointer to a + *             sph_simd384_context) + */ +void sph_simd384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the SIMD-384 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_simd384(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc    the SIMD-384 context + * @param dst   the destination buffer + */ +void sph_simd384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the SIMD-384 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_simd384_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize an SIMD-512 context. This process performs no memory allocation. + * + * @param cc   the SIMD-512 context (pointer to a + *             sph_simd512_context) + */ +void sph_simd512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the SIMD-512 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_simd512(void *cc, const void *data, size_t len); + +/** + * Terminate the current SIMD-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc    the SIMD-512 context + * @param dst   the destination buffer + */ +void sph_simd512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the SIMD-512 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_simd512_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sha3/sph_skein.c b/sha3/sph_skein.c new file mode 100644 index 00000000..7e47e352 --- /dev/null +++ b/sha3/sph_skein.c @@ -0,0 +1,1254 @@ +/* $Id: skein.c 254 2011-06-07 19:38:58Z tp $ */ +/* + * Skein implementation.
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010  Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author   Thomas Pornin + */ + +#include <stddef.h> +#include <string.h> + +#include "sph_skein.h" + +#ifdef __cplusplus +extern "C"{ +#endif + + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SKEIN +#define SPH_SMALL_FOOTPRINT_SKEIN   1 +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_64 + +#if 0 +/* obsolete */ +/* + * M5_ ## s ## _ ## i  evaluates to s+i mod 5 (0 <= s <= 18, 0 <= i <= 3). + */ + +#define M5_0_0    0 +#define M5_0_1    1 +#define M5_0_2    2 +#define M5_0_3    3 + +#define M5_1_0    1 +#define M5_1_1    2 +#define M5_1_2    3 +#define M5_1_3    4 + +#define M5_2_0    2 +#define M5_2_1    3 +#define M5_2_2    4 +#define M5_2_3    0 + +#define M5_3_0    3 +#define M5_3_1    4 +#define M5_3_2    0 +#define M5_3_3    1 + +#define M5_4_0    4 +#define M5_4_1    0 +#define M5_4_2    1 +#define M5_4_3    2 + +#define M5_5_0    0 +#define M5_5_1    1 +#define M5_5_2    2 +#define M5_5_3    3 + +#define M5_6_0    1 +#define M5_6_1    2 +#define M5_6_2    3 +#define M5_6_3    4 + +#define M5_7_0    2 +#define M5_7_1    3 +#define M5_7_2    4 +#define M5_7_3    0 + +#define M5_8_0    3 +#define M5_8_1    4 +#define M5_8_2    0 +#define M5_8_3    1 + +#define M5_9_0    4 +#define M5_9_1    0 +#define M5_9_2    1 +#define M5_9_3    2 + +#define M5_10_0   0 +#define M5_10_1   1 +#define M5_10_2   2 +#define M5_10_3   3 + +#define M5_11_0   1 +#define M5_11_1   2 +#define M5_11_2   3 +#define M5_11_3   4 + +#define M5_12_0   2 +#define M5_12_1   3 +#define M5_12_2   4 +#define M5_12_3   0 + +#define M5_13_0   3 +#define M5_13_1   4 +#define M5_13_2   0 +#define M5_13_3   1 + +#define M5_14_0   4 +#define M5_14_1   0 +#define M5_14_2   1 +#define M5_14_3   2 + +#define M5_15_0   0 +#define M5_15_1   1 +#define M5_15_2   2 +#define M5_15_3   3 + +#define M5_16_0   1 +#define M5_16_1   2 +#define M5_16_2   3 +#define M5_16_3   4 + +#define M5_17_0   2 +#define M5_17_1   3 +#define M5_17_2   4 +#define M5_17_3   0 + +#define M5_18_0   3 +#define M5_18_1   4 +#define M5_18_2   0 +#define M5_18_3   1 +#endif + +/* + * M9_ ## s ## _ ## i  evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7).
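+ * For instance, M9_17_3 evaluates to 2, since (17 + 3) mod 9 = 2.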
+ */ + +#define M9_0_0 0 +#define M9_0_1 1 +#define M9_0_2 2 +#define M9_0_3 3 +#define M9_0_4 4 +#define M9_0_5 5 +#define M9_0_6 6 +#define M9_0_7 7 + +#define M9_1_0 1 +#define M9_1_1 2 +#define M9_1_2 3 +#define M9_1_3 4 +#define M9_1_4 5 +#define M9_1_5 6 +#define M9_1_6 7 +#define M9_1_7 8 + +#define M9_2_0 2 +#define M9_2_1 3 +#define M9_2_2 4 +#define M9_2_3 5 +#define M9_2_4 6 +#define M9_2_5 7 +#define M9_2_6 8 +#define M9_2_7 0 + +#define M9_3_0 3 +#define M9_3_1 4 +#define M9_3_2 5 +#define M9_3_3 6 +#define M9_3_4 7 +#define M9_3_5 8 +#define M9_3_6 0 +#define M9_3_7 1 + +#define M9_4_0 4 +#define M9_4_1 5 +#define M9_4_2 6 +#define M9_4_3 7 +#define M9_4_4 8 +#define M9_4_5 0 +#define M9_4_6 1 +#define M9_4_7 2 + +#define M9_5_0 5 +#define M9_5_1 6 +#define M9_5_2 7 +#define M9_5_3 8 +#define M9_5_4 0 +#define M9_5_5 1 +#define M9_5_6 2 +#define M9_5_7 3 + +#define M9_6_0 6 +#define M9_6_1 7 +#define M9_6_2 8 +#define M9_6_3 0 +#define M9_6_4 1 +#define M9_6_5 2 +#define M9_6_6 3 +#define M9_6_7 4 + +#define M9_7_0 7 +#define M9_7_1 8 +#define M9_7_2 0 +#define M9_7_3 1 +#define M9_7_4 2 +#define M9_7_5 3 +#define M9_7_6 4 +#define M9_7_7 5 + +#define M9_8_0 8 +#define M9_8_1 0 +#define M9_8_2 1 +#define M9_8_3 2 +#define M9_8_4 3 +#define M9_8_5 4 +#define M9_8_6 5 +#define M9_8_7 6 + +#define M9_9_0 0 +#define M9_9_1 1 +#define M9_9_2 2 +#define M9_9_3 3 +#define M9_9_4 4 +#define M9_9_5 5 +#define M9_9_6 6 +#define M9_9_7 7 + +#define M9_10_0 1 +#define M9_10_1 2 +#define M9_10_2 3 +#define M9_10_3 4 +#define M9_10_4 5 +#define M9_10_5 6 +#define M9_10_6 7 +#define M9_10_7 8 + +#define M9_11_0 2 +#define M9_11_1 3 +#define M9_11_2 4 +#define M9_11_3 5 +#define M9_11_4 6 +#define M9_11_5 7 +#define M9_11_6 8 +#define M9_11_7 0 + +#define M9_12_0 3 +#define M9_12_1 4 +#define M9_12_2 5 +#define M9_12_3 6 +#define M9_12_4 7 +#define M9_12_5 8 +#define M9_12_6 0 +#define M9_12_7 1 + +#define M9_13_0 4 +#define M9_13_1 5 +#define M9_13_2 6 +#define M9_13_3 7 +#define M9_13_4 8 +#define M9_13_5 0 +#define M9_13_6 1 +#define M9_13_7 2 + +#define M9_14_0 5 +#define M9_14_1 6 +#define M9_14_2 7 +#define M9_14_3 8 +#define M9_14_4 0 +#define M9_14_5 1 +#define M9_14_6 2 +#define M9_14_7 3 + +#define M9_15_0 6 +#define M9_15_1 7 +#define M9_15_2 8 +#define M9_15_3 0 +#define M9_15_4 1 +#define M9_15_5 2 +#define M9_15_6 3 +#define M9_15_7 4 + +#define M9_16_0 7 +#define M9_16_1 8 +#define M9_16_2 0 +#define M9_16_3 1 +#define M9_16_4 2 +#define M9_16_5 3 +#define M9_16_6 4 +#define M9_16_7 5 + +#define M9_17_0 8 +#define M9_17_1 0 +#define M9_17_2 1 +#define M9_17_3 2 +#define M9_17_4 3 +#define M9_17_5 4 +#define M9_17_6 5 +#define M9_17_7 6 + +#define M9_18_0 0 +#define M9_18_1 1 +#define M9_18_2 2 +#define M9_18_3 3 +#define M9_18_4 4 +#define M9_18_5 5 +#define M9_18_6 6 +#define M9_18_7 7 + +/* + * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). 
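+ * For instance, M3_8_1 evaluates to 0, since (8 + 1) mod 3 = 0.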
+ */ + +#define M3_0_0 0 +#define M3_0_1 1 +#define M3_1_0 1 +#define M3_1_1 2 +#define M3_2_0 2 +#define M3_2_1 0 +#define M3_3_0 0 +#define M3_3_1 1 +#define M3_4_0 1 +#define M3_4_1 2 +#define M3_5_0 2 +#define M3_5_1 0 +#define M3_6_0 0 +#define M3_6_1 1 +#define M3_7_0 1 +#define M3_7_1 2 +#define M3_8_0 2 +#define M3_8_1 0 +#define M3_9_0 0 +#define M3_9_1 1 +#define M3_10_0 1 +#define M3_10_1 2 +#define M3_11_0 2 +#define M3_11_1 0 +#define M3_12_0 0 +#define M3_12_1 1 +#define M3_13_0 1 +#define M3_13_1 2 +#define M3_14_0 2 +#define M3_14_1 0 +#define M3_15_0 0 +#define M3_15_1 1 +#define M3_16_0 1 +#define M3_16_1 2 +#define M3_17_0 2 +#define M3_17_1 0 +#define M3_18_0 0 +#define M3_18_1 1 + +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +#if 0 +/* obsolete */ +#define SKSI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M5_, s), _), i)) +#define SKST(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) +#endif + +#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) +#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) + +#if 0 +/* obsolete */ +#define TFSMALL_KINIT(k0, k1, k2, k3, k4, t0, t1, t2) do { \ + k4 = (k0 ^ k1) ^ (k2 ^ k3) ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) +#endif + +#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) do { \ + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ SPH_C64(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_ADDKEY(w0, w1, w2, w3, k, t, s) do { \ + w0 = SPH_T64(w0 + SKSI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKSI(k, s, 1) + SKST(t, s, 0)); \ + w2 = SPH_T64(w2 + SKSI(k, s, 2) + SKST(t, s, 1)); \ + w3 = SPH_T64(w3 + SKSI(k, s, 3) + (sph_u64)s); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_ADDKEY(s, tt0, tt1) do { \ + p0 = SPH_T64(p0 + h[s + 0]); \ + p1 = SPH_T64(p1 + h[s + 1]); \ + p2 = SPH_T64(p2 + h[s + 2]); \ + p3 = SPH_T64(p3 + h[s + 3]); \ + p4 = SPH_T64(p4 + h[s + 4]); \ + p5 = SPH_T64(p5 + h[s + 5] + tt0); \ + p6 = SPH_T64(p6 + h[s + 6] + tt1); \ + p7 = SPH_T64(p7 + h[s + 7] + (sph_u64)s); \ + } while (0) + +#else + +#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) do { \ + w0 = SPH_T64(w0 + SKBI(k, s, 0)); \ + w1 = SPH_T64(w1 + SKBI(k, s, 1)); \ + w2 = SPH_T64(w2 + SKBI(k, s, 2)); \ + w3 = SPH_T64(w3 + SKBI(k, s, 3)); \ + w4 = SPH_T64(w4 + SKBI(k, s, 4)); \ + w5 = SPH_T64(w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = SPH_T64(w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = SPH_T64(w7 + SKBI(k, s, 7) + (sph_u64)s); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define TFSMALL_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) +#endif + +#define TFBIG_MIX(x0, x1, rc) do { \ + x0 = SPH_T64(x0 + x1); \ + x1 = SPH_ROTL64(x1, rc) ^ x0; \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_MIX4(w0, w1, w2, w3, rc0, rc1) do { \ + TFSMALL_MIX(w0, w1, rc0); \ + TFSMALL_MIX(w2, w3, rc1); \ + } while (0) +#endif + +#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) do { \ + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ + } while (0) + +#if 0 +/* obsolete */ +#define TFSMALL_4e(s) do { \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 14, 16); \ + TFSMALL_MIX4(p0, p3, p2, p1, 52, 57); \ + TFSMALL_MIX4(p0, p1, p2, p3, 23, 40); \ + TFSMALL_MIX4(p0, p3, p2, p1, 5, 37); \ + } while (0) + +#define TFSMALL_4o(s) do { \ + TFSMALL_ADDKEY(p0, p1, 
p2, p3, h, t, s); \ + TFSMALL_MIX4(p0, p1, p2, p3, 25, 33); \ + TFSMALL_MIX4(p0, p3, p2, p1, 46, 12); \ + TFSMALL_MIX4(p0, p1, p2, p3, 58, 22); \ + TFSMALL_MIX4(p0, p3, p2, p1, 32, 32); \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(s, t0, t1); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(s, t1, t2); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#else + +#define TFBIG_4e(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); \ + } while (0) + +#define TFBIG_4o(s) do { \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, s); \ + TFBIG_MIX8(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); \ + TFBIG_MIX8(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); \ + TFBIG_MIX8(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); \ + TFBIG_MIX8(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define UBI_SMALL(etype, extra) do { \ + sph_u64 h4, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le(buf + 0); \ + sph_u64 m1 = sph_dec64le(buf + 8); \ + sph_u64 m2 = sph_dec64le(buf + 16); \ + sph_u64 m3 = sph_dec64le(buf + 24); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + t0 = SPH_T64(bcount << 5) + (sph_u64)(extra); \ + t1 = (bcount >> 59) + ((sph_u64)(etype) << 55); \ + TFSMALL_KINIT(h0, h1, h2, h3, h4, t0, t1, t2); \ + TFSMALL_4e(0); \ + TFSMALL_4o(1); \ + TFSMALL_4e(2); \ + TFSMALL_4o(3); \ + TFSMALL_4e(4); \ + TFSMALL_4o(5); \ + TFSMALL_4e(6); \ + TFSMALL_4o(7); \ + TFSMALL_4e(8); \ + TFSMALL_4o(9); \ + TFSMALL_4e(10); \ + TFSMALL_4o(11); \ + TFSMALL_4e(12); \ + TFSMALL_4o(13); \ + TFSMALL_4e(14); \ + TFSMALL_4o(15); \ + TFSMALL_4e(16); \ + TFSMALL_4o(17); \ + TFSMALL_ADDKEY(p0, p1, p2, p3, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define UBI_BIG(etype, extra) do { \ + sph_u64 t0, t1, t2; \ + unsigned u; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h[0], h[1], h[2], h[3], h[4], h[5], \ + h[6], h[7], h[8], t0, t1, t2); \ + for (u = 0; u <= 15; u += 3) { \ + h[u + 9] = h[u + 0]; \ + h[u + 10] = h[u + 1]; \ + 
h[u + 11] = h[u + 2]; \ + } \ + for (u = 0; u < 9; u ++) { \ + sph_u64 s = u << 1; \ + sph_u64 tmp; \ + TFBIG_4e(s); \ + TFBIG_4o(s + 1); \ + tmp = t2; \ + t2 = t1; \ + t1 = t0; \ + t0 = tmp; \ + } \ + TFBIG_ADDKEY(18, t0, t1); \ + h[0] = m0 ^ p0; \ + h[1] = m1 ^ p1; \ + h[2] = m2 ^ p2; \ + h[3] = m3 ^ p3; \ + h[4] = m4 ^ p4; \ + h[5] = m5 ^ p5; \ + h[6] = m6 ^ p6; \ + h[7] = m7 ^ p7; \ + } while (0) + +#else + +#define UBI_BIG(etype, extra) do { \ + sph_u64 h8, t0, t1, t2; \ + sph_u64 m0 = sph_dec64le_aligned(buf + 0); \ + sph_u64 m1 = sph_dec64le_aligned(buf + 8); \ + sph_u64 m2 = sph_dec64le_aligned(buf + 16); \ + sph_u64 m3 = sph_dec64le_aligned(buf + 24); \ + sph_u64 m4 = sph_dec64le_aligned(buf + 32); \ + sph_u64 m5 = sph_dec64le_aligned(buf + 40); \ + sph_u64 m6 = sph_dec64le_aligned(buf + 48); \ + sph_u64 m7 = sph_dec64le_aligned(buf + 56); \ + sph_u64 p0 = m0; \ + sph_u64 p1 = m1; \ + sph_u64 p2 = m2; \ + sph_u64 p3 = m3; \ + sph_u64 p4 = m4; \ + sph_u64 p5 = m5; \ + sph_u64 p6 = m6; \ + sph_u64 p7 = m7; \ + t0 = SPH_T64(bcount << 6) + (sph_u64)(extra); \ + t1 = (bcount >> 58) + ((sph_u64)(etype) << 55); \ + TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); \ + TFBIG_4e(0); \ + TFBIG_4o(1); \ + TFBIG_4e(2); \ + TFBIG_4o(3); \ + TFBIG_4e(4); \ + TFBIG_4o(5); \ + TFBIG_4e(6); \ + TFBIG_4o(7); \ + TFBIG_4e(8); \ + TFBIG_4o(9); \ + TFBIG_4e(10); \ + TFBIG_4o(11); \ + TFBIG_4e(12); \ + TFBIG_4o(13); \ + TFBIG_4e(14); \ + TFBIG_4o(15); \ + TFBIG_4e(16); \ + TFBIG_4o(17); \ + TFBIG_ADDKEY(p0, p1, p2, p3, p4, p5, p6, p7, h, t, 18); \ + h0 = m0 ^ p0; \ + h1 = m1 ^ p1; \ + h2 = m2 ^ p2; \ + h3 = m3 ^ p3; \ + h4 = m4 ^ p4; \ + h5 = m5 ^ p5; \ + h6 = m6 ^ p6; \ + h7 = m7 ^ p7; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +#define DECL_STATE_SMALL \ + sph_u64 h0, h1, h2, h3; \ + sph_u64 bcount; + +#define READ_STATE_SMALL(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_SMALL(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + sc->bcount = bcount; \ + } while (0) +#endif + +#if SPH_SMALL_FOOTPRINT_SKEIN + +#define DECL_STATE_BIG \ + sph_u64 h[27]; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h[0] = (sc)->h0; \ + h[1] = (sc)->h1; \ + h[2] = (sc)->h2; \ + h[3] = (sc)->h3; \ + h[4] = (sc)->h4; \ + h[5] = (sc)->h5; \ + h[6] = (sc)->h6; \ + h[7] = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h[0]; \ + (sc)->h1 = h[1]; \ + (sc)->h2 = h[2]; \ + (sc)->h3 = h[3]; \ + (sc)->h4 = h[4]; \ + (sc)->h5 = h[5]; \ + (sc)->h6 = h[6]; \ + (sc)->h7 = h[7]; \ + sc->bcount = bcount; \ + } while (0) + +#else + +#define DECL_STATE_BIG \ + sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; \ + sph_u64 bcount; + +#define READ_STATE_BIG(sc) do { \ + h0 = (sc)->h0; \ + h1 = (sc)->h1; \ + h2 = (sc)->h2; \ + h3 = (sc)->h3; \ + h4 = (sc)->h4; \ + h5 = (sc)->h5; \ + h6 = (sc)->h6; \ + h7 = (sc)->h7; \ + bcount = sc->bcount; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + (sc)->h0 = h0; \ + (sc)->h1 = h1; \ + (sc)->h2 = h2; \ + (sc)->h3 = h3; \ + (sc)->h4 = h4; \ + (sc)->h5 = h5; \ + (sc)->h6 = h6; \ + (sc)->h7 = h7; \ + sc->bcount = bcount; \ + } while (0) + +#endif + +#if 0 +/* obsolete */ +static void +skein_small_init(sph_skein_small_context *sc, const sph_u64 *iv) +{ + sc->h0 = iv[0]; + sc->h1 = iv[1]; + sc->h2 = iv[2]; + sc->h3 = iv[3]; + sc->bcount = 0; + sc->ptr = 0; +} +#endif + +static void 
+skein_big_init(sph_skein_big_context *sc, const sph_u64 *iv) +{ +	sc->h0 = iv[0]; +	sc->h1 = iv[1]; +	sc->h2 = iv[2]; +	sc->h3 = iv[3]; +	sc->h4 = iv[4]; +	sc->h5 = iv[5]; +	sc->h6 = iv[6]; +	sc->h7 = iv[7]; +	sc->bcount = 0; +	sc->ptr = 0; +} + +#if 0 +/* obsolete */ +static void +skein_small_core(sph_skein_small_context *sc, const void *data, size_t len) +{ +	unsigned char *buf; +	size_t ptr, clen; +	unsigned first; +	DECL_STATE_SMALL + +	buf = sc->buf; +	ptr = sc->ptr; +	clen = (sizeof sc->buf) - ptr; +	if (len <= clen) { +		memcpy(buf + ptr, data, len); +		sc->ptr = ptr + len; +		return; +	} +	if (clen != 0) { +		memcpy(buf + ptr, data, clen); +		data = (const unsigned char *)data + clen; +		len -= clen; +	} + +#if SPH_SMALL_FOOTPRINT_SKEIN + +	READ_STATE_SMALL(sc); +	first = (bcount == 0) << 7; +	for (;;) { +		bcount ++; +		UBI_SMALL(96 + first, 0); +		if (len <= sizeof sc->buf) +			break; +		first = 0; +		memcpy(buf, data, sizeof sc->buf); +		data = (const unsigned char *)data + sizeof sc->buf; +		len -= sizeof sc->buf; +	} +	WRITE_STATE_SMALL(sc); +	sc->ptr = len; +	memcpy(buf, data, len); + +#else + +	/* +	 * Unrolling the loop yields a slight performance boost, while +	 * keeping the code size around 24 kB on 32-bit x86. +	 */ +	READ_STATE_SMALL(sc); +	first = (bcount == 0) << 7; +	for (;;) { +		bcount ++; +		UBI_SMALL(96 + first, 0); +		if (len <= sizeof sc->buf) +			break; +		buf = (unsigned char *)data; +		bcount ++; +		UBI_SMALL(96, 0); +		if (len <= 2 * sizeof sc->buf) { +			data = buf + sizeof sc->buf; +			len -= sizeof sc->buf; +			break; +		} +		buf += sizeof sc->buf; +		data = buf + sizeof sc->buf; +		first = 0; +		len -= 2 * sizeof sc->buf; +	} +	WRITE_STATE_SMALL(sc); +	sc->ptr = len; +	memcpy(sc->buf, data, len); + +#endif +} +#endif + +static void +skein_big_core(sph_skein_big_context *sc, const void *data, size_t len) +{ +	/* +	 * The Skein "final bit" in the tweak is troublesome here, +	 * because if the input has a length which is a multiple of the +	 * block size (512 bits) then that bit must be set for the +	 * final block, which is full of message bits (padding in +	 * Skein can be reduced to no extra bit at all). However, this +	 * function cannot know whether it processes the last chunks of +	 * the message or not. Hence we may keep a full block of buffered +	 * data (64 bytes).
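+	 * Concretely: after feeding exactly 128 bytes into an empty +	 * context, the first 64-byte block is compressed but the second +	 * stays in the buffer, so that the close routine can still flag +	 * it as final if no further data arrives.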
+ */ + unsigned char *buf; + size_t ptr; + unsigned first; + DECL_STATE_BIG + + buf = sc->buf; + ptr = sc->ptr; + if (len <= (sizeof sc->buf) - ptr) { + memcpy(buf + ptr, data, len); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE_BIG(sc); + first = (bcount == 0) << 7; + do { + size_t clen; + + if (ptr == sizeof sc->buf) { + bcount ++; + UBI_BIG(96 + first, 0); + first = 0; + ptr = 0; + } + clen = (sizeof sc->buf) - ptr; + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + } while (len > 0); + WRITE_STATE_BIG(sc); + sc->ptr = ptr; +} + +#if 0 +/* obsolete */ +static void +skein_small_close(sph_skein_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; + DECL_STATE_SMALL + + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_small_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + READ_STATE_SMALL(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_SMALL(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + memcpy(dst, buf, out_len); +} +#endif + +static void +skein_big_close(sph_skein_big_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_len) +{ + unsigned char *buf; + size_t ptr; + unsigned et; + int i; +#if SPH_SMALL_FOOTPRINT_SKEIN + size_t u; +#endif + DECL_STATE_BIG + + /* + * Add bit padding if necessary. + */ + if (n != 0) { + unsigned z; + unsigned char x; + + z = 0x80 >> n; + x = ((ub & -z) | z) & 0xFF; + skein_big_core(sc, &x, 1); + } + + buf = sc->buf; + ptr = sc->ptr; + + /* + * At that point, if ptr == 0, then the message was empty; + * otherwise, there is between 1 and 64 bytes (inclusive) which + * are yet to be processed. Either way, we complete the buffer + * to a full block with zeros (the Skein specification mandates + * that an empty message is padded so that there is at least + * one block to process). + * + * Once this block has been processed, we do it again, with + * a block full of zeros, for the output (that block contains + * the encoding of "0", over 8 bytes, then padded with zeros). + */ + READ_STATE_BIG(sc); + memset(buf + ptr, 0, (sizeof sc->buf) - ptr); + et = 352 + ((bcount == 0) << 7) + (n != 0); + for (i = 0; i < 2; i ++) { + UBI_BIG(et, ptr); + if (i == 0) { + memset(buf, 0, sizeof sc->buf); + bcount = 0; + et = 510; + ptr = 8; + } + } + +#if SPH_SMALL_FOOTPRINT_SKEIN + + /* + * We use a temporary buffer because we must support the case + * where output size is not a multiple of 64 (namely, a 224-bit + * output). 
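+	 * For Skein-224, out_len is 28: the loop below encodes the four +	 * words h[0] to h[3] (32 bytes) into buf, and the final memcpy() +	 * keeps only the first 28 bytes.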
+ */ + for (u = 0; u < out_len; u += 8) + sph_enc64le_aligned(buf + u, h[u >> 3]); + memcpy(dst, buf, out_len); + +#else + + sph_enc64le_aligned(buf + 0, h0); + sph_enc64le_aligned(buf + 8, h1); + sph_enc64le_aligned(buf + 16, h2); + sph_enc64le_aligned(buf + 24, h3); + sph_enc64le_aligned(buf + 32, h4); + sph_enc64le_aligned(buf + 40, h5); + sph_enc64le_aligned(buf + 48, h6); + sph_enc64le_aligned(buf + 56, h7); + memcpy(dst, buf, out_len); + +#endif +} + +#if 0 +/* obsolete */ +static const sph_u64 IV224[] = { + SPH_C64(0xC6098A8C9AE5EA0B), SPH_C64(0x876D568608C5191C), + SPH_C64(0x99CB88D7D7F53884), SPH_C64(0x384BDDB1AEDDB5DE) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xFC9DA860D048B449), SPH_C64(0x2FCA66479FA7D833), + SPH_C64(0xB33BC3896656840F), SPH_C64(0x6A54E920FDE8DA69) +}; +#endif + +static const sph_u64 IV224[] = { + SPH_C64(0xCCD0616248677224), SPH_C64(0xCBA65CF3A92339EF), + SPH_C64(0x8CCD69D652FF4B64), SPH_C64(0x398AED7B3AB890B4), + SPH_C64(0x0F59D1B1457D2BD0), SPH_C64(0x6776FE6575D4EB3D), + SPH_C64(0x99FBC70E997413E9), SPH_C64(0x9E2CFCCFE1C41EF7) +}; + +static const sph_u64 IV256[] = { + SPH_C64(0xCCD044A12FDB3E13), SPH_C64(0xE83590301A79A9EB), + SPH_C64(0x55AEA0614F816E6F), SPH_C64(0x2A2767A4AE9B94DB), + SPH_C64(0xEC06025E74DD7683), SPH_C64(0xE7A436CDC4746251), + SPH_C64(0xC36FBAF9393AD185), SPH_C64(0x3EEDBA1833EDFC13) +}; + +static const sph_u64 IV384[] = { + SPH_C64(0xA3F6C6BF3A75EF5F), SPH_C64(0xB0FEF9CCFD84FAA4), + SPH_C64(0x9D77DD663D770CFE), SPH_C64(0xD798CBF3B468FDDA), + SPH_C64(0x1BC4A6668A0E4465), SPH_C64(0x7ED7D434E5807407), + SPH_C64(0x548FC1ACD4EC44D6), SPH_C64(0x266E17546AA18FF8) +}; + +static const sph_u64 IV512[] = { + SPH_C64(0x4903ADFF749C51CE), SPH_C64(0x0D95DE399746DF03), + SPH_C64(0x8FD1934127C79BCE), SPH_C64(0x9A255629FF352CB1), + SPH_C64(0x5DB62599DF6CA7B0), SPH_C64(0xEABE394CA9D5C3F4), + SPH_C64(0x991112C71A75B523), SPH_C64(0xAE18A40B660FCC33) +}; + +#if 0 +/* obsolete */ +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_small_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_small_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_small_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_small_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} +#endif + +/* see sph_skein.h */ +void +sph_skein224_init(void *cc) +{ + skein_big_init(cc, IV224); +} + +/* see sph_skein.h */ +void +sph_skein224(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein224_close(void *cc, void *dst) +{ + sph_skein224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 28); + sph_skein224_init(cc); +} + +/* see 
sph_skein.h */ +void +sph_skein256_init(void *cc) +{ + skein_big_init(cc, IV256); +} + +/* see sph_skein.h */ +void +sph_skein256(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein256_close(void *cc, void *dst) +{ + sph_skein256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 32); + sph_skein256_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein384_init(void *cc) +{ + skein_big_init(cc, IV384); +} + +/* see sph_skein.h */ +void +sph_skein384(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein384_close(void *cc, void *dst) +{ + sph_skein384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 48); + sph_skein384_init(cc); +} + +/* see sph_skein.h */ +void +sph_skein512_init(void *cc) +{ + skein_big_init(cc, IV512); +} + +/* see sph_skein.h */ +void +sph_skein512(void *cc, const void *data, size_t len) +{ + skein_big_core(cc, data, len); +} + +/* see sph_skein.h */ +void +sph_skein512_close(void *cc, void *dst) +{ + sph_skein512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_skein.h */ +void +sph_skein512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + skein_big_close(cc, ub, n, dst, 64); + sph_skein512_init(cc); +} + +#endif + + +#ifdef __cplusplus +} +#endif diff --git a/sha3/sph_skein.h b/sha3/sph_skein.h new file mode 100644 index 00000000..bddbc86f --- /dev/null +++ b/sha3/sph_skein.h @@ -0,0 +1,298 @@ +/* $Id: sph_skein.h 253 2011-06-07 18:33:10Z tp $ */ +/** + * Skein interface. The Skein specification defines three main + * functions, called Skein-256, Skein-512 and Skein-1024, which can be + * further parameterized with an output length. For the SHA-3 + * competition, Skein-512 is used for output sizes of 224, 256, 384 and + * 512 bits; this is what this code implements. Thus, we hereafter call + * Skein-224, Skein-256, Skein-384 and Skein-512 what the Skein + * specification defines as Skein-512-224, Skein-512-256, Skein-512-384 + * and Skein-512-512, respectively. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file     sph_skein.h + * @author   Thomas Pornin + */ + +#ifndef SPH_SKEIN_H__ +#define SPH_SKEIN_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +#if SPH_64 + +/** + * Output size (in bits) for Skein-224. + */ +#define SPH_SIZE_skein224   224 + +/** + * Output size (in bits) for Skein-256. + */ +#define SPH_SIZE_skein256   256 + +/** + * Output size (in bits) for Skein-384. + */ +#define SPH_SIZE_skein384   384 + +/** + * Output size (in bits) for Skein-512. + */ +#define SPH_SIZE_skein512   512 + +/** + * This structure is a context for Skein computations (with a 384- or + * 512-bit output): it contains the intermediate values and some data + * from the last entered block. Once a Skein computation has been + * performed, the context can be reused for another computation. + * + * The contents of this structure are private. A running Skein computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE +	unsigned char buf[64];    /* first field, for alignment */ +	size_t ptr; +	sph_u64 h0, h1, h2, h3, h4, h5, h6, h7; +	sph_u64 bcount; +#endif +} sph_skein_big_context; + +/** + * Type for a Skein-224 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein224_context; + +/** + * Type for a Skein-256 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein256_context; + +/** + * Type for a Skein-384 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein384_context; + +/** + * Type for a Skein-512 context (identical to the common "big" context). + */ +typedef sph_skein_big_context sph_skein512_context; + +/** + * Initialize a Skein-224 context. This process performs no memory allocation. + * + * @param cc   the Skein-224 context (pointer to a + *             sph_skein224_context) + */ +void sph_skein224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the Skein-224 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_skein224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc    the Skein-224 context + * @param dst   the destination buffer + */ +void sph_skein224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized.
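+ * For example, appending a single 1 bit uses ub = 0x80 and n = 1, + * since that lone extra bit is bit number 7 under this convention.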
+ * + * @param cc    the Skein-224 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_skein224_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-256 context. This process performs no memory allocation. + * + * @param cc   the Skein-256 context (pointer to a + *             sph_skein256_context) + */ +void sph_skein256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the Skein-256 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_skein256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc    the Skein-256 context + * @param dst   the destination buffer + */ +void sph_skein256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the Skein-256 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_skein256_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-384 context. This process performs no memory allocation. + * + * @param cc   the Skein-384 context (pointer to a + *             sph_skein384_context) + */ +void sph_skein384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc     the Skein-384 context + * @param data   the input data + * @param len    the input data length (in bytes) + */ +void sph_skein384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Skein-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc    the Skein-384 context + * @param dst   the destination buffer + */ +void sph_skein384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 downto 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc    the Skein-384 context + * @param ub    the extra bits + * @param n     the number of extra bits (0 to 7) + * @param dst   the destination buffer + */ +void sph_skein384_addbits_and_close( +	void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Skein-512 context. This process performs no memory allocation. + * + * @param cc   the Skein-512 context (pointer to a + *             sph_skein512_context) + */ +void sph_skein512_init(void *cc); + +/** + * Process some data bytes.
+ * (in which case this function does nothing).
+ *
+ * @param cc the Skein-512 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_skein512(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current Skein-512 computation and output the result into
+ * the provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (64 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the Skein-512 context
+ * @param dst the destination buffer
+ */
+void sph_skein512_close(void *cc, void *dst);
+
+/**
+ * Add a few additional bits (0 to 7) to the current computation, then
+ * terminate it and output the result in the provided buffer, which must
+ * be wide enough to accommodate the result (64 bytes). If bit number i
+ * in ub has value 2^i, then the extra bits are those
+ * numbered 7 down to 8-n (this is the big-endian convention at the byte
+ * level). The context is automatically reinitialized.
+ *
+ * @param cc the Skein-512 context
+ * @param ub the extra bits
+ * @param n the number of extra bits (0 to 7)
+ * @param dst the destination buffer
+ */
+void sph_skein512_addbits_and_close(
+	void *cc, unsigned ub, unsigned n, void *dst);
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sha3/sph_types.h b/sha3/sph_types.h
new file mode 100644
index 00000000..7295b0b3
--- /dev/null
+++ b/sha3/sph_types.h
@@ -0,0 +1,1976 @@
+/* $Id: sph_types.h 260 2011-07-21 01:02:38Z tp $ */
+/**
+ * Basic type definitions.
+ *
+ * This header file defines the generic integer types that will be used
+ * for the implementation of hash functions; it also contains helper
+ * functions which encode and decode multi-byte integer values, using
+ * either little-endian or big-endian conventions.
+ *
+ * This file contains a compile-time test on the size of a byte
+ * (the unsigned char C type). If bytes are not octets,
+ * i.e. if they do not have a size of exactly 8 bits, then compilation
+ * is aborted. Architectures where bytes are not octets are relatively
+ * rare, even in the embedded devices market. We forbid non-octet bytes
+ * because there is no clear convention on how octet streams are encoded
+ * on such systems.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_types.h
+ * @author Thomas Pornin
+ */
+
+#ifndef SPH_TYPES_H__
+#define SPH_TYPES_H__
+
+#include <limits.h>
+
+/*
+ * All our I/O functions are defined over octet streams. We do not know
+ * how to handle input data if bytes are not octets.
+ */
+#if CHAR_BIT != 8
+#error This code requires 8-bit bytes
+#endif
+
+/* ============= BEGIN documentation block for Doxygen ============ */
+
+#ifdef DOXYGEN_IGNORE
+
+/** @mainpage sphlib C code documentation
+ *
+ * @section overview Overview
+ *
+ * sphlib is a library which contains implementations of
+ * various cryptographic hash functions. These pages have been generated
+ * with doxygen and
+ * document the API for the C implementations.
+ *
+ * The API is described in appropriate header files, which are available
+ * in the "Files" section. Each hash function family has its own header,
+ * whose name begins with "sph_" and contains the family
+ * name. For instance, the API for the RIPEMD hash functions is available
+ * in the header file sph_ripemd.h.
+ *
+ * @section principles API structure and conventions
+ *
+ * @subsection io Input/output conventions
+ *
+ * In all generality, hash functions operate over strings of bits.
+ * Individual bits are rarely encountered in C programming or actual
+ * communication protocols; most protocols converge on the ubiquitous
+ * "octet" which is a group of eight bits. Data is thus expressed as a
+ * stream of octets. The C programming language contains the notion of a
+ * "byte", which is a data unit managed under the type "unsigned
+ * char". The C standard prescribes that a byte should hold at
+ * least eight bits, but possibly more. Most modern architectures, even
+ * in the embedded world, feature eight-bit bytes, i.e. map bytes to
+ * octets.
+ *
+ * Nevertheless, for some of the implemented hash functions, an extra
+ * API has been added, which allows the input of arbitrary sequences of
+ * bits: when the computation is about to be closed, 1 to 7 extra bits
+ * can be added. The functions for which this API is implemented include
+ * the SHA-2 functions and all SHA-3 candidates.
+ *
+ * sphlib defines hash functions which may hash octet streams,
+ * i.e. streams of bits where the number of bits is a multiple of eight.
+ * The data input functions in the sphlib API expect data
+ * as anonymous pointers ("const void *") with a length
+ * (of type "size_t") which gives the input data chunk length
+ * in bytes. A byte is assumed to be an octet; the sph_types.h
+ * header contains a compile-time test which prevents compilation on
+ * architectures where this property is not met.
+ *
+ * The hash function output is also converted into bytes. All currently
+ * implemented hash functions have an output width which is a multiple of
+ * eight, and this is likely to remain true for new designs.
+ *
+ * Most hash functions internally convert input data into 32-bit or 64-bit
+ * words, using either little-endian or big-endian conversion. The hash
+ * output also often consists of such words, which are encoded into output
+ * bytes with a similar endianness convention. Some hash functions have
+ * been only loosely specified on that subject; when necessary,
+ * sphlib has been tested against published "reference"
+ * implementations in order to use the same conventions.
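+ *
+ * As a concrete illustration of these conventions (a minimal sketch,
+ * using the Skein-512 functions declared in sph_skein.h), hashing a
+ * three-byte message looks like this:
+ *
+ *   sph_skein512_context cc;
+ *   unsigned char out[64];
+ *
+ *   sph_skein512_init(&cc);
+ *   sph_skein512(&cc, "abc", 3);
+ *   sph_skein512_close(&cc, out);
+ *
+ * The 64-byte digest is written into out; the init/update/close
+ * pattern used here is described in detail in the following subsections.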
+ *
+ * @subsection shortname Function short name
+ *
+ * Each implemented hash function has a "short name" which is used
+ * internally to derive the identifiers for the functions and context
+ * structures which the function uses. For instance, MD5 has the short
+ * name "md5". Short names are listed in the next section,
+ * for the implemented hash functions. In subsequent sections, the
+ * short name will be assumed to be "XXX": replace with the
+ * actual hash function name to get the C identifier.
+ *
+ * Note: some functions within the same family share the same core
+ * elements, such as update function or context structure. Correspondingly,
+ * some of the defined types or functions may actually be macros which
+ * transparently evaluate to another type or function name.
+ *
+ * @subsection context Context structure
+ *
+ * Each implemented hash function has its own context structure, available
+ * under the type name "sph_XXX_context" for the hash function
+ * with short name "XXX". This structure holds all needed
+ * state for a running hash computation.
+ *
+ * The contents of these structures are meant to be opaque, and private
+ * to the implementation. However, these contents are specified in the
+ * header files so that application code which uses sphlib
+ * may access the size of those structures.
+ *
+ * The caller is responsible for allocating the context structure,
+ * whether by dynamic allocation (malloc() or equivalent),
+ * static allocation (a global permanent variable), as an automatic
+ * variable ("on the stack"), or by any other means which ensures proper
+ * structure alignment. sphlib code performs no dynamic
+ * allocation by itself.
+ *
+ * The context must be initialized before use, using the
+ * sph_XXX_init() function. This function sets the context
+ * state to proper initial values for hashing.
+ *
+ * Since all state data is contained within the context structure,
+ * sphlib is thread-safe and reentrant: several hash
+ * computations may be performed in parallel, provided that they do not
+ * operate on the same context. Moreover, a running computation can be
+ * cloned by copying the context (with a simple memcpy()):
+ * the context and its clone are then independent and may be updated
+ * with new data and/or closed without interfering with each other.
+ * Similarly, a context structure can be moved in memory at will:
+ * context structures contain no pointer, in particular no pointer to
+ * themselves.
+ *
+ * @subsection dataio Data input
+ *
+ * Hashed data is input with the sph_XXX() function, which
+ * takes as parameters a pointer to the context, a pointer to the data
+ * to hash, and the number of data bytes to hash. The context is updated
+ * with the new data.
+ *
+ * Data can be input in one or several calls, with arbitrary input lengths.
+ * However, it is best, performance-wise, to input data by relatively big
+ * chunks (say a few kilobytes), because this allows sphlib to
+ * optimize things and avoid internal copying.
+ *
+ * When all data has been input, the context can be closed with
+ * sph_XXX_close(). The hash output is computed and written
+ * into the provided buffer. The caller must take care to provide a
+ * buffer of appropriate length; e.g., when using SHA-1, the output is
+ * a 20-byte value, therefore the output buffer must be at least
+ * 20 bytes long.
+ *
+ * For some hash functions, the sph_XXX_addbits_and_close()
+ * function can be used instead of sph_XXX_close(). This
+ * function can take a few extra bits to be added at
+ * the end of the input message. This allows hashing messages with a
+ * bit length which is not a multiple of 8. The extra bits are provided
+ * as an unsigned integer value, and a bit count. The bit count must be
+ * between 0 and 7, inclusive. The extra bits are provided as bits 7 to
+ * 0 (bits of numerical value 128, 64, 32... down to 1), in that order.
+ * For instance, to add three bits of value 1, 1 and 0, the unsigned
+ * integer will have value 192 (1*128 + 1*64 + 0*32) and the bit count
+ * will be 3.
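+ *
+ * In code, that example reads as follows (a sketch, again using the
+ * Skein-512 functions; msg and msg_len are placeholders standing for
+ * the whole bytes of the message and their count):
+ *
+ *   sph_skein512_context cc;
+ *   unsigned char out[64];
+ *
+ *   sph_skein512_init(&cc);
+ *   sph_skein512(&cc, msg, msg_len);
+ *   sph_skein512_addbits_and_close(&cc, 192, 3, out);
+ *
+ * which hashes the message bytes followed by the extra bits 1, 1 and 0.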
+ *
+ * The SPH_SIZE_XXX macro is defined for each hash function;
+ * it evaluates to the function output size, expressed in bits. For instance,
+ * SPH_SIZE_sha1 evaluates to 160.
+ *
+ * When closed, the context is automatically reinitialized and can be
+ * immediately used for another computation. It is not necessary to call
+ * sph_XXX_init() after a close. Note that
+ * sph_XXX_init() can still be called to "reset" a context,
+ * i.e. forget previously input data, and get back to the initial state.
+ *
+ * @subsection alignment Data alignment
+ *
+ * "Alignment" is a property of data, which is said to be "properly
+ * aligned" when its emplacement in memory is such that the data can
+ * be optimally read by full words. This depends on the type of access;
+ * basically, some hash functions will read data by 32-bit or 64-bit
+ * words. sphlib does not mandate such alignment for input
+ * data, but using aligned data can substantially improve performance.
+ *
+ * As a rule, it is best to input data by chunks whose length (in bytes)
+ * is a multiple of eight, and which begin at "generally aligned"
+ * addresses, such as the base address returned by a call to
+ * malloc().
+ *
+ * @section functions Implemented functions
+ *
+ * We give here the list of implemented functions. They are grouped by
+ * family; to each family corresponds a specific header file. Each
+ * individual function has its associated "short name". Please refer to
+ * the documentation for that header file to get details on the hash
+ * function denomination and provenance.
+ *
+ * Note: the functions marked with a '(64)' in the list below are
+ * available only if the C compiler provides an integer type of length
+ * 64 bits or more. Such a type is mandatory in the latest C standard
+ * (ISO 9899:1999, aka "C99") and is present in several older compilers
+ * as well, so chances are that such a type is available.
+ *
+ * - HAVAL family: file sph_haval.h
+ *    - HAVAL-128/3 (128-bit, 3 passes): short name: haval128_3
+ *    - HAVAL-128/4 (128-bit, 4 passes): short name: haval128_4
+ *    - HAVAL-128/5 (128-bit, 5 passes): short name: haval128_5
+ *    - HAVAL-160/3 (160-bit, 3 passes): short name: haval160_3
+ *    - HAVAL-160/4 (160-bit, 4 passes): short name: haval160_4
+ *    - HAVAL-160/5 (160-bit, 5 passes): short name: haval160_5
+ *    - HAVAL-192/3 (192-bit, 3 passes): short name: haval192_3
+ *    - HAVAL-192/4 (192-bit, 4 passes): short name: haval192_4
+ *    - HAVAL-192/5 (192-bit, 5 passes): short name: haval192_5
+ *    - HAVAL-224/3 (224-bit, 3 passes): short name: haval224_3
+ *    - HAVAL-224/4 (224-bit, 4 passes): short name: haval224_4
+ *    - HAVAL-224/5 (224-bit, 5 passes): short name: haval224_5
+ *    - HAVAL-256/3 (256-bit, 3 passes): short name: haval256_3
+ *    - HAVAL-256/4 (256-bit, 4 passes): short name: haval256_4
+ *    - HAVAL-256/5 (256-bit, 5 passes): short name: haval256_5
+ * - MD2: file sph_md2.h, short name: md2
+ * - MD4: file sph_md4.h, short name: md4
+ * - MD5: file sph_md5.h, short name: md5
+ * - PANAMA: file sph_panama.h, short name: panama
+ * - RadioGatun family: file sph_radiogatun.h
+ *    - RadioGatun[32]: short name: radiogatun32
+ *    - RadioGatun[64]: short name: radiogatun64 (64)
+ * - RIPEMD family: file sph_ripemd.h
+ *    - RIPEMD: short name: ripemd
+ *    - RIPEMD-128: short name: ripemd128
+ *    - RIPEMD-160: short name: ripemd160
+ * - SHA-0: file sph_sha0.h, short name: sha0
+ * - SHA-1: file sph_sha1.h, short name: sha1
+ * - SHA-2 family, 32-bit hashes: file sph_sha2.h
+ *    - SHA-224: short name: sha224
+ *    - SHA-256: short name: sha256
+ *    - SHA-384: short name: sha384 (64)
+ *    - SHA-512: short name: sha512 (64)
+ * - Tiger family: file sph_tiger.h
+ *    - Tiger: short name: tiger (64)
+ *    - Tiger2: short name: tiger2 (64)
+ * - WHIRLPOOL family: file sph_whirlpool.h
+ *    - WHIRLPOOL-0: short name: whirlpool0 (64)
+ *    - WHIRLPOOL-1: short name: whirlpool1 (64)
+ *    - WHIRLPOOL: short name: whirlpool (64)
+ *
+ * The fourteen second-round SHA-3 candidates are also implemented;
+ * when applicable, the implementations follow the "final" specifications
+ * as published for the third round of the SHA-3 competition (BLAKE,
+ * Groestl, JH, Keccak and Skein have been tweaked for the third round).
+ *
+ * - BLAKE family: file sph_blake.h
+ *    - BLAKE-224: short name: blake224
+ *    - BLAKE-256: short name: blake256
+ *    - BLAKE-384: short name: blake384
+ *    - BLAKE-512: short name: blake512
+ * - BMW (Blue Midnight Wish) family: file sph_bmw.h
+ *    - BMW-224: short name: bmw224
+ *    - BMW-256: short name: bmw256
+ *    - BMW-384: short name: bmw384 (64)
+ *    - BMW-512: short name: bmw512 (64)
+ * - CubeHash family: file sph_cubehash.h (specified as
+ *   CubeHash16/32 in the CubeHash specification)
+ *    - CubeHash-224: short name: cubehash224
+ *    - CubeHash-256: short name: cubehash256
+ *    - CubeHash-384: short name: cubehash384
+ *    - CubeHash-512: short name: cubehash512
+ * - ECHO family: file sph_echo.h
+ *    - ECHO-224: short name: echo224
+ *    - ECHO-256: short name: echo256
+ *    - ECHO-384: short name: echo384
+ *    - ECHO-512: short name: echo512
+ * - Fugue family: file sph_fugue.h
+ *    - Fugue-224: short name: fugue224
+ *    - Fugue-256: short name: fugue256
+ *    - Fugue-384: short name: fugue384
+ *    - Fugue-512: short name: fugue512
+ * - Groestl family: file sph_groestl.h
+ *    - Groestl-224: short name: groestl224
+ *    - Groestl-256: short name: groestl256
+ *    - Groestl-384: short name: groestl384
+ *    - Groestl-512: short name: groestl512
+ * - Hamsi family: file sph_hamsi.h
+ *    - Hamsi-224: short name: hamsi224
+ *    - Hamsi-256: short name: hamsi256
+ *    - Hamsi-384: short name: hamsi384
+ *    - Hamsi-512: short name: hamsi512
+ * - JH family: file sph_jh.h
+ *    - JH-224: short name: jh224
+ *    - JH-256: short name: jh256
+ *    - JH-384: short name: jh384
+ *    - JH-512: short name: jh512
+ * - Keccak family: file sph_keccak.h
+ *    - Keccak-224: short name: keccak224
+ *    - Keccak-256: short name: keccak256
+ *    - Keccak-384: short name: keccak384
+ *    - Keccak-512: short name: keccak512
+ * - Luffa family: file sph_luffa.h
+ *    - Luffa-224: short name: luffa224
+ *    - Luffa-256: short name: luffa256
+ *    - Luffa-384: short name: luffa384
+ *    - Luffa-512: short name: luffa512
+ * - Shabal family: file sph_shabal.h
+ *    - Shabal-192: short name: shabal192
+ *    - Shabal-224: short name: shabal224
+ *    - Shabal-256: short name: shabal256
+ *    - Shabal-384: short name: shabal384
+ *    - Shabal-512: short name: shabal512
+ * - SHAvite-3 family: file sph_shavite.h
+ *    - SHAvite-224 (nominally "SHAvite-3 with 224-bit output"):
+ *      short name: shavite224
+ *    - SHAvite-256 (nominally "SHAvite-3 with 256-bit output"):
+ *      short name: shavite256
+ *    - SHAvite-384 (nominally "SHAvite-3 with 384-bit output"):
+ *      short name: shavite384
+ *    - SHAvite-512 (nominally "SHAvite-3 with 512-bit output"):
+ *      short name: shavite512
+ * - SIMD family: file sph_simd.h
+ *    - SIMD-224: short name: simd224
+ *    - SIMD-256: short name: simd256
+ *    - SIMD-384: short name: simd384
+ *    - SIMD-512: short name: simd512
+ * - Skein family: file sph_skein.h
+ *    - Skein-224 (nominally specified as Skein-512-224): short name:
+ *      skein224 (64)
+ *    - Skein-256 (nominally specified as Skein-512-256): short name:
+ *      skein256 (64)
+ *    - Skein-384 (nominally specified as Skein-512-384): short name:
+ *      skein384 (64)
+ *    - Skein-512 (nominally specified as Skein-512-512): short name:
+ *      skein512 (64)
+ *
+ * For the second-round SHA-3 candidates, the functions are as specified
+ * for round 2, i.e. with the "tweaks" that some candidates added
+ * between round 1 and round 2. Also, some of the submitted packages for
+ * round 2 contained errors, in the specification, reference code, or
+ * both. sphlib implements the corrected versions.
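+ *
+ * Finally, as an illustration of the context-cloning property described
+ * earlier (a sketch; prefix, suffix1, suffix2 and their lengths are
+ * placeholder buffers), a running computation can be forked with a
+ * simple memcpy() from string.h:
+ *
+ *   sph_skein512_context cc, cc2;
+ *
+ *   sph_skein512_init(&cc);
+ *   sph_skein512(&cc, prefix, prefix_len);
+ *   memcpy(&cc2, &cc, sizeof cc);
+ *   sph_skein512(&cc, suffix1, suffix1_len);
+ *   sph_skein512(&cc2, suffix2, suffix2_len);
+ *
+ * Both contexts may then be closed independently, yielding the digests
+ * of prefix||suffix1 and prefix||suffix2 with only one pass over the
+ * common prefix.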
+ */
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 32 bits; on most
+ * architectures, it will have a width of exactly 32 bits. Unsigned C
+ * types implement arithmetic modulo a power of 2; use the
+ * SPH_T32() macro to ensure that the value is truncated
+ * to exactly 32 bits. Unless otherwise specified, all macros and
+ * functions which accept sph_u32 values assume that these
+ * values fit on 32 bits, i.e. do not exceed 2^32-1, even on architectures
+ * where sph_u32 is larger than that.
+ */
+typedef __arch_dependant__ sph_u32;
+
+/** @hideinitializer
+ * Signed integer type corresponding to sph_u32; it has
+ * width 32 bits or more.
+ */
+typedef __arch_dependant__ sph_s32;
+
+/** @hideinitializer
+ * Unsigned integer type whose length is at least 64 bits; on most
+ * architectures which feature such a type, it will have a width of
+ * exactly 64 bits. C99-compliant platforms will have this type; it
+ * is also defined when the GNU compiler (gcc) is used, and on
+ * platforms where unsigned long is large enough. If this
+ * type is not available, then some hash functions which depend on
+ * a 64-bit type will not be available (most notably SHA-384, SHA-512,
+ * Tiger and WHIRLPOOL).
+ */
+typedef __arch_dependant__ sph_u64;
+
+/** @hideinitializer
+ * Signed integer type corresponding to sph_u64; it has
+ * width 64 bits or more.
+ */
+typedef __arch_dependant__ sph_s64;
+
+/**
+ * This macro expands the token x into a suitable
+ * constant expression of type sph_u32. Depending on
+ * how this type is defined, a suffix such as UL may
+ * be appended to the argument.
+ *
+ * @param x the token to expand into a suitable constant expression
+ */
+#define SPH_C32(x)
+
+/**
+ * Truncate a 32-bit value to exactly 32 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler.
+ *
+ * @param x the value to truncate (of type sph_u32)
+ */
+#define SPH_T32(x)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * sph_u32 is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x the value to rotate (of type sph_u32)
+ * @param n the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTL32(x, n)
+
+/**
+ * Rotate a 32-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 31. This macro assumes that its
+ * first argument fits in 32 bits (no extra bit allowed on machines where
+ * sph_u32 is wider); both arguments may be evaluated
+ * several times.
+ *
+ * @param x the value to rotate (of type sph_u32)
+ * @param n the rotation count (between 1 and 31, inclusive)
+ */
+#define SPH_ROTR32(x, n)
+
+/**
+ * This macro is defined on systems for which a 64-bit type has been
+ * detected, and is used for sph_u64.
+ */
+#define SPH_64
+
+/**
+ * This macro is defined on systems for which the "native" integer size is
+ * 64 bits (64-bit values fit in one register).
+ */
+#define SPH_64_TRUE
+
+/**
+ * This macro expands the token x into a suitable
+ * constant expression of type sph_u64. Depending on
+ * how this type is defined, a suffix such as ULL may
+ * be appended to the argument. This macro is defined only if a
+ * 64-bit type was detected and used for sph_u64.
+ *
+ * @param x the token to expand into a suitable constant expression
+ */
+#define SPH_C64(x)
+
+/**
+ * Truncate a 64-bit value to exactly 64 bits. On most systems, this is
+ * a no-op, recognized as such by the compiler. This macro is defined only
+ * if a 64-bit type was detected and used for sph_u64.
+ *
+ * @param x the value to truncate (of type sph_u64)
+ */
+#define SPH_T64(x)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the left. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * sph_u64 is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for sph_u64.
+ *
+ * @param x the value to rotate (of type sph_u64)
+ * @param n the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTL64(x, n)
+
+/**
+ * Rotate a 64-bit value by a number of bits to the right. The rotate
+ * count must reside between 1 and 63. This macro assumes that its
+ * first argument fits in 64 bits (no extra bit allowed on machines where
+ * sph_u64 is wider); both arguments may be evaluated
+ * several times. This macro is defined only if a 64-bit type was detected
+ * and used for sph_u64.
+ *
+ * @param x the value to rotate (of type sph_u64)
+ * @param n the rotation count (between 1 and 63, inclusive)
+ */
+#define SPH_ROTR64(x, n)
+
+/**
+ * This macro evaluates to inline or an equivalent construction,
+ * if available on the compilation platform, or to nothing otherwise. This
+ * is used to declare inline functions, for which the compiler should
+ * endeavour to include the code directly in the caller. Inline functions
+ * are typically defined in header files as replacement for macros.
+ */
+#define SPH_INLINE
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * little-endian convention. This implies that the sph_u32
+ * type (and the sph_u64 type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_LITTLE_ENDIAN
+
+/**
+ * This macro is defined if the platform has been detected as using
+ * big-endian convention. This implies that the sph_u32
+ * type (and the sph_u64 type also, if it is defined) has
+ * an exact width (i.e. exactly 32-bit, respectively 64-bit).
+ */
+#define SPH_BIG_ENDIAN
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in little-endian
+ * convention. This is the case for little-endian platforms, and also
+ * for the big-endian platforms which have special little-endian access
+ * opcodes (e.g. Ultrasparc).
+ */
+#define SPH_LITTLE_FAST
+
+/**
+ * This macro is defined if 32-bit words (and 64-bit words, if defined)
+ * can be read from and written to memory efficiently in big-endian
+ * convention. This is the case for big-endian platforms, and also
+ * for the little-endian platforms which have special big-endian access
+ * opcodes.
+ */
+#define SPH_BIG_FAST
+
+/**
+ * On some platforms, this macro is defined to an unsigned integer type
+ * into which pointer values may be cast. The resulting value can then
+ * be tested for being a multiple of 2, 4 or 8, indicating an aligned
+ * pointer for, respectively, 16-bit, 32-bit or 64-bit memory accesses.
+ */
+#define SPH_UPTR
+
+/**
+ * When defined, this macro indicates that unaligned memory accesses
+ * are possible with only a minor penalty, and thus should be preferred
+ * over strategies which first copy data to an aligned buffer.
+ */
+#define SPH_UNALIGNED
+
+/**
+ * Byte-swap a 32-bit word (i.e.
0x12345678 becomes + * 0x78563412). This is an inline function which resorts + * to inline assembly on some platforms, for better performance. + * + * @param x the 32-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u32 sph_bswap32(sph_u32 x); + +/** + * Byte-swap a 64-bit word. This is an inline function which resorts + * to inline assembly on some platforms, for better performance. This + * function is defined only if a suitable 64-bit type was found for + * sph_u64 + * + * @param x the 64-bit value to byte-swap + * @return the byte-swapped value + */ +static inline sph_u64 sph_bswap64(sph_u64 x); + +/** + * Decode a 16-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16le(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16le(void *dst, unsigned val); + +/** + * Decode a 16-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline unsigned sph_dec16be(const void *src); + +/** + * Encode a 16-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc16be(void *dst, unsigned val); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32le() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32le_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32le() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32le_aligned(void *dst, sph_u32 val); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be(const void *src); + +/** + * Decode a 32-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). 
This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec32be() function. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u32 sph_dec32be_aligned(const void *src); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be(void *dst, sph_u32 val); + +/** + * Encode a 32-bit unsigned value into memory, in big-endian convention + * (most significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc32be() function. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc32be_aligned(void *dst, sph_u32 val); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * source address is suitably aligned for a direct access, if the platform + * supports such things; it can thus be marginally faster than the generic + * sph_dec64le() function. This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64le_aligned(const void *src); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le(void *dst, sph_u64 val); + +/** + * Encode a 64-bit unsigned value into memory, in little-endian convention + * (least significant byte comes first). This function assumes that the + * destination address is suitably aligned for a direct access, if the + * platform supports such things; it can thus be marginally faster than + * the generic sph_enc64le() function. This function is defined + * only if a suitable 64-bit type was detected and used for + * sph_u64. + * + * @param dst the destination buffer + * @param val the value to encode + */ +static inline void sph_enc64le_aligned(void *dst, sph_u64 val); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). This function is defined only + * if a suitable 64-bit type was detected and used for sph_u64. + * + * @param src the source address + * @return the decoded value + */ +static inline sph_u64 sph_dec64be(const void *src); + +/** + * Decode a 64-bit unsigned value from memory, in big-endian convention + * (most significant byte comes first). 
This function assumes that the
+ * source address is suitably aligned for a direct access, if the platform
+ * supports such things; it can thus be marginally faster than the generic
+ * sph_dec64be() function. This function is defined only
+ * if a suitable 64-bit type was detected and used for sph_u64.
+ *
+ * @param src the source address
+ * @return the decoded value
+ */
+static inline sph_u64 sph_dec64be_aligned(const void *src);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function is defined only
+ * if a suitable 64-bit type was detected and used for sph_u64.
+ *
+ * @param dst the destination buffer
+ * @param val the value to encode
+ */
+static inline void sph_enc64be(void *dst, sph_u64 val);
+
+/**
+ * Encode a 64-bit unsigned value into memory, in big-endian convention
+ * (most significant byte comes first). This function assumes that the
+ * destination address is suitably aligned for a direct access, if the
+ * platform supports such things; it can thus be marginally faster than
+ * the generic sph_enc64be() function. This function is defined
+ * only if a suitable 64-bit type was detected and used for
+ * sph_u64.
+ *
+ * @param dst the destination buffer
+ * @param val the value to encode
+ */
+static inline void sph_enc64be_aligned(void *dst, sph_u64 val);
+
+#endif
+
+/* ============== END documentation block for Doxygen ============= */
+
+#ifndef DOXYGEN_IGNORE
+
+/*
+ * We want to define the types "sph_u32" and "sph_u64" which hold
+ * unsigned values of at least, respectively, 32 and 64 bits. These
+ * tests should select appropriate types for most platforms. The
+ * macro "SPH_64" is defined if the 64-bit type is supported.
+ */
+
+#undef SPH_64
+#undef SPH_64_TRUE
+
+#if defined __STDC__ && __STDC_VERSION__ >= 199901L
+
+/*
+ * On C99 implementations, we can use <stdint.h> to get an exact 64-bit
+ * type, if any, or otherwise use a wider type (which must exist, for
+ * C99 conformance).
+ */
+
+#include <stdint.h>
+
+#ifdef UINT32_MAX
+typedef uint32_t sph_u32;
+typedef int32_t sph_s32;
+#else
+typedef uint_fast32_t sph_u32;
+typedef int_fast32_t sph_s32;
+#endif
+#if !SPH_NO_64
+#ifdef UINT64_MAX
+typedef uint64_t sph_u64;
+typedef int64_t sph_s64;
+#else
+typedef uint_fast64_t sph_u64;
+typedef int_fast64_t sph_s64;
+#endif
+#endif
+
+#define SPH_C32(x) ((sph_u32)(x))
+#if !SPH_NO_64
+#define SPH_C64(x) ((sph_u64)(x))
+#define SPH_64 1
+#endif
+
+#else
+
+/*
+ * On non-C99 systems, we use "unsigned int" if it is wide enough,
+ * "unsigned long" otherwise. This supports all "reasonable" architectures.
+ * We have to be cautious: pre-C99 preprocessors handle constants
+ * differently in '#if' expressions. Hence the shifts to test UINT_MAX.
+ */
+
+#if ((UINT_MAX >> 11) >> 11) >= 0x3FF
+
+typedef unsigned int sph_u32;
+typedef int sph_s32;
+
+#define SPH_C32(x) ((sph_u32)(x ## U))
+
+#else
+
+typedef unsigned long sph_u32;
+typedef long sph_s32;
+
+#define SPH_C32(x) ((sph_u32)(x ## UL))
+
+#endif
+
+#if !SPH_NO_64
+
+/*
+ * We want a 64-bit type. We use "unsigned long" if it is wide enough (as
+ * is common on 64-bit architectures such as AMD64, Alpha or Sparcv9),
+ * "unsigned long long" otherwise, if available. We use ULLONG_MAX to
+ * test whether "unsigned long long" is available; we also know that
+ * gcc features this type, even if the libc headers do not know it.
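+ * (The double shift keeps every shift count below 32, so the test
+ * remains valid even if the preprocessor evaluates '#if' expressions
+ * with 32-bit arithmetic: ((ULONG_MAX >> 31) >> 31) >= 3 holds exactly
+ * when ULONG_MAX is at least 2^64-1, i.e. when "unsigned long" has at
+ * least 64 value bits, whereas a direct 62-bit shift could be
+ * undefined there.)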
+ */ + +#if ((ULONG_MAX >> 31) >> 31) >= 3 + +typedef unsigned long sph_u64; +typedef long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## UL)) + +#define SPH_64 1 + +#elif ((ULLONG_MAX >> 31) >> 31) >= 3 || defined __GNUC__ + +typedef unsigned long long sph_u64; +typedef long long sph_s64; + +#define SPH_C64(x) ((sph_u64)(x ## ULL)) + +#define SPH_64 1 + +#else + +/* + * No 64-bit type... + */ + +#endif + +#endif + +#endif + +/* + * If the "unsigned long" type has length 64 bits or more, then this is + * a "true" 64-bit architectures. This is also true with Visual C on + * amd64, even though the "long" type is limited to 32 bits. + */ +#if SPH_64 && (((ULONG_MAX >> 31) >> 31) >= 3 || defined _M_X64) +#define SPH_64_TRUE 1 +#endif + +/* + * Implementation note: some processors have specific opcodes to perform + * a rotation. Recent versions of gcc recognize the expression above and + * use the relevant opcodes, when appropriate. + */ + +#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) +#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) + +#if SPH_64 + +#define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#define SPH_ROTL64(x, n) SPH_T64(((x) << (n)) | ((x) >> (64 - (n)))) +#define SPH_ROTR64(x, n) SPH_ROTL64(x, (64 - (n))) + +#endif + +#ifndef DOXYGEN_IGNORE +/* + * Define SPH_INLINE to be an "inline" qualifier, if available. We define + * some small macro-like functions which benefit greatly from being inlined. + */ +#if (defined __STDC__ && __STDC_VERSION__ >= 199901L) || defined __GNUC__ +#define SPH_INLINE inline +#elif defined _MSC_VER +#define SPH_INLINE __inline +#else +#define SPH_INLINE +#endif +#endif + +/* + * We define some macros which qualify the architecture. These macros + * may be explicit set externally (e.g. as compiler parameters). The + * code below sets those macros if they are not already defined. + * + * Most macros are boolean, thus evaluate to either zero or non-zero. + * The SPH_UPTR macro is special, in that it evaluates to a C type, + * or is not defined. + * + * SPH_UPTR if defined: unsigned type to cast pointers into + * + * SPH_UNALIGNED non-zero if unaligned accesses are efficient + * SPH_LITTLE_ENDIAN non-zero if architecture is known to be little-endian + * SPH_BIG_ENDIAN non-zero if architecture is known to be big-endian + * SPH_LITTLE_FAST non-zero if little-endian decoding is fast + * SPH_BIG_FAST non-zero if big-endian decoding is fast + * + * If SPH_UPTR is defined, then encoding and decoding of 32-bit and 64-bit + * values will try to be "smart". Either SPH_LITTLE_ENDIAN or SPH_BIG_ENDIAN + * _must_ be non-zero in those situations. The 32-bit and 64-bit types + * _must_ also have an exact width. + * + * SPH_SPARCV9_GCC_32 UltraSPARC-compatible with gcc, 32-bit mode + * SPH_SPARCV9_GCC_64 UltraSPARC-compatible with gcc, 64-bit mode + * SPH_SPARCV9_GCC UltraSPARC-compatible with gcc + * SPH_I386_GCC x86-compatible (32-bit) with gcc + * SPH_I386_MSVC x86-compatible (32-bit) with Microsoft Visual C + * SPH_AMD64_GCC x86-compatible (64-bit) with gcc + * SPH_AMD64_MSVC x86-compatible (64-bit) with Microsoft Visual C + * SPH_PPC32_GCC PowerPC, 32-bit, with gcc + * SPH_PPC64_GCC PowerPC, 64-bit, with gcc + * + * TODO: enhance automatic detection, for more architectures and compilers. + * Endianness is the most important. SPH_UNALIGNED and SPH_UPTR help with + * some very fast functions (e.g. MD4) when using unaligned input data. 
+ * The CPU-specific-with-GCC macros are useful only for inline assembly,
+ * normally confined to this header file.
+ */
+
+/*
+ * 32-bit x86, aka "i386 compatible".
+ */
+#if defined __i386__ || defined _M_IX86
+
+#define SPH_DETECT_UNALIGNED 1
+#define SPH_DETECT_LITTLE_ENDIAN 1
+#define SPH_DETECT_UPTR sph_u32
+#ifdef __GNUC__
+#define SPH_DETECT_I386_GCC 1
+#endif
+#ifdef _MSC_VER
+#define SPH_DETECT_I386_MSVC 1
+#endif
+
+/*
+ * 64-bit x86, hereafter known as "amd64".
+ */
+#elif defined __x86_64 || defined _M_X64
+
+#define SPH_DETECT_UNALIGNED 1
+#define SPH_DETECT_LITTLE_ENDIAN 1
+#define SPH_DETECT_UPTR sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_AMD64_GCC 1
+#endif
+#ifdef _MSC_VER
+#define SPH_DETECT_AMD64_MSVC 1
+#endif
+
+/*
+ * 64-bit Sparc architecture (implies v9).
+ */
+#elif ((defined __sparc__ || defined __sparc) && defined __arch64__) \
+	|| defined __sparcv9
+
+#define SPH_DETECT_BIG_ENDIAN 1
+#define SPH_DETECT_UPTR sph_u64
+#ifdef __GNUC__
+#define SPH_DETECT_SPARCV9_GCC_64 1
+#define SPH_DETECT_LITTLE_FAST 1
+#endif
+
+/*
+ * 32-bit Sparc.
+ */
+#elif (defined __sparc__ || defined __sparc) \
+	&& !(defined __sparcv9 || defined __arch64__)
+
+#define SPH_DETECT_BIG_ENDIAN 1
+#define SPH_DETECT_UPTR sph_u32
+#if defined __GNUC__ && defined __sparc_v9__
+#define SPH_DETECT_SPARCV9_GCC_32 1
+#define SPH_DETECT_LITTLE_FAST 1
+#endif
+
+/*
+ * ARM, little-endian.
+ */
+#elif defined __arm__ && __ARMEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN 1
+
+/*
+ * MIPS, little-endian.
+ */
+#elif MIPSEL || _MIPSEL || __MIPSEL || __MIPSEL__
+
+#define SPH_DETECT_LITTLE_ENDIAN 1
+
+/*
+ * MIPS, big-endian.
+ */
+#elif MIPSEB || _MIPSEB || __MIPSEB || __MIPSEB__
+
+#define SPH_DETECT_BIG_ENDIAN 1
+
+/*
+ * PowerPC.
+ */
+#elif defined __powerpc__ || defined __POWERPC__ || defined __ppc__ \
+	|| defined _ARCH_PPC
+
+/*
+ * Note: we do not declare cross-endian access to be "fast": even if
+ * using inline assembly, implementations should still assume that
+ * keeping the decoded word in a temporary is faster than decoding
+ * it again.
+ */
+#if defined __GNUC__
+#if SPH_64_TRUE
+#define SPH_DETECT_PPC64_GCC 1
+#else
+#define SPH_DETECT_PPC32_GCC 1
+#endif
+#endif
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN 1
+#elif defined __LITTLE_ENDIAN__ || defined _LITTLE_ENDIAN
+#define SPH_DETECT_LITTLE_ENDIAN 1
+#endif
+
+/*
+ * Itanium, 64-bit.
+ */
+#elif defined __ia64 || defined __ia64__ \
+	|| defined __itanium__ || defined _M_IA64
+
+#if defined __BIG_ENDIAN__ || defined _BIG_ENDIAN
+#define SPH_DETECT_BIG_ENDIAN 1
+#else
+#define SPH_DETECT_LITTLE_ENDIAN 1
+#endif
+#if defined __LP64__ || defined _LP64
+#define SPH_DETECT_UPTR sph_u64
+#else
+#define SPH_DETECT_UPTR sph_u32
+#endif
+
+#endif
+
+#if defined SPH_DETECT_SPARCV9_GCC_32 || defined SPH_DETECT_SPARCV9_GCC_64
+#define SPH_DETECT_SPARCV9_GCC 1
+#endif
+
+#if defined SPH_DETECT_UNALIGNED && !defined SPH_UNALIGNED
+#define SPH_UNALIGNED SPH_DETECT_UNALIGNED
+#endif
+#if defined SPH_DETECT_UPTR && !defined SPH_UPTR
+#define SPH_UPTR SPH_DETECT_UPTR
+#endif
+#if defined SPH_DETECT_LITTLE_ENDIAN && !defined SPH_LITTLE_ENDIAN
+#define SPH_LITTLE_ENDIAN SPH_DETECT_LITTLE_ENDIAN
+#endif
+#if defined SPH_DETECT_BIG_ENDIAN && !defined SPH_BIG_ENDIAN
+#define SPH_BIG_ENDIAN SPH_DETECT_BIG_ENDIAN
+#endif
+#if defined SPH_DETECT_LITTLE_FAST && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST SPH_DETECT_LITTLE_FAST
+#endif
+#if defined SPH_DETECT_BIG_FAST && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST SPH_DETECT_BIG_FAST
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_32 && !defined SPH_SPARCV9_GCC_32
+#define SPH_SPARCV9_GCC_32 SPH_DETECT_SPARCV9_GCC_32
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC_64 && !defined SPH_SPARCV9_GCC_64
+#define SPH_SPARCV9_GCC_64 SPH_DETECT_SPARCV9_GCC_64
+#endif
+#if defined SPH_DETECT_SPARCV9_GCC && !defined SPH_SPARCV9_GCC
+#define SPH_SPARCV9_GCC SPH_DETECT_SPARCV9_GCC
+#endif
+#if defined SPH_DETECT_I386_GCC && !defined SPH_I386_GCC
+#define SPH_I386_GCC SPH_DETECT_I386_GCC
+#endif
+#if defined SPH_DETECT_I386_MSVC && !defined SPH_I386_MSVC
+#define SPH_I386_MSVC SPH_DETECT_I386_MSVC
+#endif
+#if defined SPH_DETECT_AMD64_GCC && !defined SPH_AMD64_GCC
+#define SPH_AMD64_GCC SPH_DETECT_AMD64_GCC
+#endif
+#if defined SPH_DETECT_AMD64_MSVC && !defined SPH_AMD64_MSVC
+#define SPH_AMD64_MSVC SPH_DETECT_AMD64_MSVC
+#endif
+#if defined SPH_DETECT_PPC32_GCC && !defined SPH_PPC32_GCC
+#define SPH_PPC32_GCC SPH_DETECT_PPC32_GCC
+#endif
+#if defined SPH_DETECT_PPC64_GCC && !defined SPH_PPC64_GCC
+#define SPH_PPC64_GCC SPH_DETECT_PPC64_GCC
+#endif
+
+#if SPH_LITTLE_ENDIAN && !defined SPH_LITTLE_FAST
+#define SPH_LITTLE_FAST 1
+#endif
+#if SPH_BIG_ENDIAN && !defined SPH_BIG_FAST
+#define SPH_BIG_FAST 1
+#endif
+
+#if defined SPH_UPTR && !(SPH_LITTLE_ENDIAN || SPH_BIG_ENDIAN)
+#error SPH_UPTR defined, but endianness is not known.
+#endif
+
+#if SPH_I386_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 32-bit, with gcc, we use the bswapl opcode to byte-swap 32-bit
+ * values.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	return ((sph_u64)sph_bswap32((sph_u32)x) << 32)
+		| (sph_u64)sph_bswap32((sph_u32)(x >> 32));
+}
+
+#endif
+
+#elif SPH_AMD64_GCC && !SPH_NO_ASM
+
+/*
+ * On x86 64-bit, with gcc, we use the bswapl and bswapq opcodes to
+ * byte-swap 32-bit and 64-bit values.
+ */
+
+static SPH_INLINE sph_u32
+sph_bswap32(sph_u32 x)
+{
+	__asm__ __volatile__ ("bswapl %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#if SPH_64
+
+static SPH_INLINE sph_u64
+sph_bswap64(sph_u64 x)
+{
+	__asm__ __volatile__ ("bswapq %0" : "=r" (x) : "0" (x));
+	return x;
+}
+
+#endif
+
+/*
+ * Disabled code. Apparently, Microsoft Visual C 2005 is smart enough
+ * to generate proper opcodes for endianness swapping with the pure C
+ * implementation below.
+ * + +#elif SPH_I386_MSVC && !SPH_NO_ASM + +static __inline sph_u32 __declspec(naked) __fastcall +sph_bswap32(sph_u32 x) +{ + __asm { + bswap ecx + mov eax,ecx + ret + } +} + +#if SPH_64 + +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + return ((sph_u64)sph_bswap32((sph_u32)x) << 32) + | (sph_u64)sph_bswap32((sph_u32)(x >> 32)); +} + +#endif + + * + * [end of disabled code] + */ + +#else + +static SPH_INLINE sph_u32 +sph_bswap32(sph_u32 x) +{ + x = SPH_T32((x << 16) | (x >> 16)); + x = ((x & SPH_C32(0xFF00FF00)) >> 8) + | ((x & SPH_C32(0x00FF00FF)) << 8); + return x; +} + +#if SPH_64 + +/** + * Byte-swap a 64-bit value. + * + * @param x the input value + * @return the byte-swapped value + */ +static SPH_INLINE sph_u64 +sph_bswap64(sph_u64 x) +{ + x = SPH_T64((x << 32) | (x >> 32)); + x = ((x & SPH_C64(0xFFFF0000FFFF0000)) >> 16) + | ((x & SPH_C64(0x0000FFFF0000FFFF)) << 16); + x = ((x & SPH_C64(0xFF00FF00FF00FF00)) >> 8) + | ((x & SPH_C64(0x00FF00FF00FF00FF)) << 8); + return x; +} + +#endif + +#endif + +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + +/* + * On UltraSPARC systems, native ordering is big-endian, but it is + * possible to perform little-endian read accesses by specifying the + * address space 0x88 (ASI_PRIMARY_LITTLE). Basically, either we use + * the opcode "lda [%reg]0x88,%dst", where %reg is the register which + * contains the source address and %dst is the destination register, + * or we use "lda [%reg+imm]%asi,%dst", which uses the %asi register + * to get the address space name. The latter format is better since it + * combines an addition and the actual access in a single opcode; but + * it requires the setting (and subsequent resetting) of %asi, which is + * slow. Some operations (i.e. MD5 compression function) combine many + * successive little-endian read accesses, which may share the same + * %asi setting. The macros below contain the appropriate inline + * assembly. + */ + +#define SPH_SPARCV9_SET_ASI \ + sph_u32 sph_sparcv9_asi; \ + __asm__ __volatile__ ( \ + "rd %%asi,%0\n\twr %%g0,0x88,%%asi" : "=r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_RESET_ASI \ + __asm__ __volatile__ ("wr %%g0,%0,%%asi" : : "r" (sph_sparcv9_asi)); + +#define SPH_SPARCV9_DEC32LE(base, idx) ({ \ + sph_u32 sph_sparcv9_tmp; \ + __asm__ __volatile__ ("lda [%1+" #idx "*4]%%asi,%0" \ + : "=r" (sph_sparcv9_tmp) : "r" (base)); \ + sph_sparcv9_tmp; \ + }) + +#endif + +static SPH_INLINE void +sph_enc16be(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = (val >> 8); + ((unsigned char *)dst)[1] = val; +} + +static SPH_INLINE unsigned +sph_dec16be(const void *src) +{ + return ((unsigned)(((const unsigned char *)src)[0]) << 8) + | (unsigned)(((const unsigned char *)src)[1]); +} + +static SPH_INLINE void +sph_enc16le(void *dst, unsigned val) +{ + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = val >> 8; +} + +static SPH_INLINE unsigned +sph_dec16le(const void *src) +{ + return (unsigned)(((const unsigned char *)src)[0]) + | ((unsigned)(((const unsigned char *)src)[1]) << 8); +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). 
+ * + * @param dst the destination buffer + * @param val the 32-bit value to encode + */ +static SPH_INLINE void +sph_enc32be(void *dst, sph_u32 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; +#else + if (((SPH_UPTR)dst & 3) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap32(val); +#endif + *(sph_u32 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (32-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc32be_aligned(void *dst, sph_u32 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u32 *)dst = sph_bswap32(val); +#elif SPH_BIG_ENDIAN + *(sph_u32 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 24); + ((unsigned char *)dst)[1] = (val >> 16); + ((unsigned char *)dst)[2] = (val >> 8); + ((unsigned char *)dst)[3] = val; +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif +#else + if (((SPH_UPTR)src & 3) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#else + return *(const sph_u32 *)src; +#endif + } else { + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); + } +#endif +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Decode a 32-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap32(*(const sph_u32 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u32 *)src; +#else + return ((sph_u32)(((const unsigned char *)src)[0]) << 24) + | ((sph_u32)(((const unsigned char *)src)[1]) << 16) + | ((sph_u32)(((const unsigned char *)src)[2]) << 8) + | (sph_u32)(((const unsigned char *)src)[3]); +#endif +} + +/** + * Encode a 32-bit value into the provided buffer (little endian convention). 
+ *
+ * @param dst the destination buffer
+ * @param val the 32-bit value to encode
+ */
+static SPH_INLINE void
+sph_enc32le(void *dst, sph_u32 val)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	val = sph_bswap32(val);
+#endif
+	*(sph_u32 *)dst = val;
+#else
+	if (((SPH_UPTR)dst & 3) == 0) {
+#if SPH_BIG_ENDIAN
+		val = sph_bswap32(val);
+#endif
+		*(sph_u32 *)dst = val;
+	} else {
+		((unsigned char *)dst)[0] = val;
+		((unsigned char *)dst)[1] = (val >> 8);
+		((unsigned char *)dst)[2] = (val >> 16);
+		((unsigned char *)dst)[3] = (val >> 24);
+	}
+#endif
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Encode a 32-bit value into the provided buffer (little endian convention).
+ * The destination buffer must be properly aligned.
+ *
+ * @param dst the destination buffer (32-bit aligned)
+ * @param val the value to encode
+ */
+static SPH_INLINE void
+sph_enc32le_aligned(void *dst, sph_u32 val)
+{
+#if SPH_LITTLE_ENDIAN
+	*(sph_u32 *)dst = val;
+#elif SPH_BIG_ENDIAN
+	*(sph_u32 *)dst = sph_bswap32(val);
+#else
+	((unsigned char *)dst)[0] = val;
+	((unsigned char *)dst)[1] = (val >> 8);
+	((unsigned char *)dst)[2] = (val >> 16);
+	((unsigned char *)dst)[3] = (val >> 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ *
+ * @param src the source buffer
+ * @return the decoded value
+ */
+static SPH_INLINE sph_u32
+sph_dec32le(const void *src)
+{
+#if defined SPH_UPTR
+#if SPH_UNALIGNED
+#if SPH_BIG_ENDIAN
+	return sph_bswap32(*(const sph_u32 *)src);
+#else
+	return *(const sph_u32 *)src;
+#endif
+#else
+	if (((SPH_UPTR)src & 3) == 0) {
+#if SPH_BIG_ENDIAN
+#if SPH_SPARCV9_GCC && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		/*
+		 * "__volatile__" is needed here because without it,
+		 * gcc-3.4.3 miscompiles the code and performs the
+		 * access before the test on the address, thus triggering
+		 * a bus error...
+		 */
+		__asm__ __volatile__ (
+			"lda [%1]0x88,%0" : "=r" (tmp) : "r" (src));
+		return tmp;
+/*
+ * On PowerPC, this turns out not to be worth the effort: the inline
+ * assembly makes the GCC optimizer uncomfortable, which tends to nullify
+ * the decoding gains.
+ *
+ * For most hash functions, using this inline assembly trick changes
+ * hashing speed by less than 5% and often _reduces_ it. The biggest
+ * gains are for MD4 (+11%) and CubeHash (+30%). For all others, it is
+ * less than 10%. The speed gain on CubeHash is probably due to the
+ * chronic shortage of registers that CubeHash endures; for the other
+ * functions, the generic code appears to be efficient enough already.
+ *
+#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM
+		sph_u32 tmp;
+
+		__asm__ __volatile__ (
+			"lwbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+		return tmp;
+ */
+#else
+		return sph_bswap32(*(const sph_u32 *)src);
+#endif
+#else
+		return *(const sph_u32 *)src;
+#endif
+	} else {
+		return (sph_u32)(((const unsigned char *)src)[0])
+			| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+			| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+			| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+	}
+#endif
+#else
+	return (sph_u32)(((const unsigned char *)src)[0])
+		| ((sph_u32)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u32)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u32)(((const unsigned char *)src)[3]) << 24);
+#endif
+}
+
+/**
+ * Decode a 32-bit value from the provided buffer (little endian convention).
+ * The source buffer must be properly aligned. + * + * @param src the source buffer (32-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u32 +sph_dec32le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u32 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lda [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif (SPH_PPC32_GCC || SPH_PPC64_GCC) && !SPH_NO_ASM + sph_u32 tmp; + + __asm__ __volatile__ ("lwbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap32(*(const sph_u32 *)src); +#endif +#else + return (sph_u32)(((const unsigned char *)src)[0]) + | ((sph_u32)(((const unsigned char *)src)[1]) << 8) + | ((sph_u32)(((const unsigned char *)src)[2]) << 16) + | ((sph_u32)(((const unsigned char *)src)[3]) << 24); +#endif +} + +#if SPH_64 + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64be(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_LITTLE_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; + } +#endif +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (big endian convention). + * The destination buffer must be properly aligned. + * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64be_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = val; +#else + ((unsigned char *)dst)[0] = (val >> 56); + ((unsigned char *)dst)[1] = (val >> 48); + ((unsigned char *)dst)[2] = (val >> 40); + ((unsigned char *)dst)[3] = (val >> 32); + ((unsigned char *)dst)[4] = (val >> 24); + ((unsigned char *)dst)[5] = (val >> 16); + ((unsigned char *)dst)[6] = (val >> 8); + ((unsigned char *)dst)[7] = val; +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). 
+ * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif + } else { + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); + } +#endif +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (big endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64be_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#elif SPH_BIG_ENDIAN + return *(const sph_u64 *)src; +#else + return ((sph_u64)(((const unsigned char *)src)[0]) << 56) + | ((sph_u64)(((const unsigned char *)src)[1]) << 48) + | ((sph_u64)(((const unsigned char *)src)[2]) << 40) + | ((sph_u64)(((const unsigned char *)src)[3]) << 32) + | ((sph_u64)(((const unsigned char *)src)[4]) << 24) + | ((sph_u64)(((const unsigned char *)src)[5]) << 16) + | ((sph_u64)(((const unsigned char *)src)[6]) << 8) + | (sph_u64)(((const unsigned char *)src)[7]); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * + * @param dst the destination buffer + * @param val the 64-bit value to encode + */ +static SPH_INLINE void +sph_enc64le(void *dst, sph_u64 val) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; +#else + if (((SPH_UPTR)dst & 7) == 0) { +#if SPH_BIG_ENDIAN + val = sph_bswap64(val); +#endif + *(sph_u64 *)dst = val; + } else { + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); + } +#endif +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Encode a 64-bit value into the provided buffer (little endian convention). + * The destination buffer must be properly aligned. 
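+ * For example, encoding the value 0x0102030405060708 stores the bytes 0x08,
+ * 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01 in that order.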
+ * + * @param dst the destination buffer (64-bit aligned) + * @param val the value to encode + */ +static SPH_INLINE void +sph_enc64le_aligned(void *dst, sph_u64 val) +{ +#if SPH_LITTLE_ENDIAN + *(sph_u64 *)dst = val; +#elif SPH_BIG_ENDIAN + *(sph_u64 *)dst = sph_bswap64(val); +#else + ((unsigned char *)dst)[0] = val; + ((unsigned char *)dst)[1] = (val >> 8); + ((unsigned char *)dst)[2] = (val >> 16); + ((unsigned char *)dst)[3] = (val >> 24); + ((unsigned char *)dst)[4] = (val >> 32); + ((unsigned char *)dst)[5] = (val >> 40); + ((unsigned char *)dst)[6] = (val >> 48); + ((unsigned char *)dst)[7] = (val >> 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * + * @param src the source buffer + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le(const void *src) +{ +#if defined SPH_UPTR +#if SPH_UNALIGNED +#if SPH_BIG_ENDIAN + return sph_bswap64(*(const sph_u64 *)src); +#else + return *(const sph_u64 *)src; +#endif +#else + if (((SPH_UPTR)src & 7) == 0) { +#if SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. + * +#elif SPH_PPC32_GCC && !SPH_NO_ASM + return (sph_u64)sph_dec32le_aligned(src) + | ((sph_u64)sph_dec32le_aligned( + (const char *)src + 4) << 32); +#elif SPH_PPC64_GCC && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ( + "ldbrx %0,0,%1" : "=r" (tmp) : "r" (src)); + return tmp; + */ +#else + return sph_bswap64(*(const sph_u64 *)src); +#endif +#else + return *(const sph_u64 *)src; +#endif + } else { + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); + } +#endif +#else + return (sph_u64)(((const unsigned char *)src)[0]) + | ((sph_u64)(((const unsigned char *)src)[1]) << 8) + | ((sph_u64)(((const unsigned char *)src)[2]) << 16) + | ((sph_u64)(((const unsigned char *)src)[3]) << 24) + | ((sph_u64)(((const unsigned char *)src)[4]) << 32) + | ((sph_u64)(((const unsigned char *)src)[5]) << 40) + | ((sph_u64)(((const unsigned char *)src)[6]) << 48) + | ((sph_u64)(((const unsigned char *)src)[7]) << 56); +#endif +} + +/** + * Decode a 64-bit value from the provided buffer (little endian convention). + * The source buffer must be properly aligned. + * + * @param src the source buffer (64-bit aligned) + * @return the decoded value + */ +static SPH_INLINE sph_u64 +sph_dec64le_aligned(const void *src) +{ +#if SPH_LITTLE_ENDIAN + return *(const sph_u64 *)src; +#elif SPH_BIG_ENDIAN +#if SPH_SPARCV9_GCC_64 && !SPH_NO_ASM + sph_u64 tmp; + + __asm__ __volatile__ ("ldxa [%1]0x88,%0" : "=r" (tmp) : "r" (src)); + return tmp; +/* + * Not worth it generally. 
+ *
+#elif SPH_PPC32_GCC && !SPH_NO_ASM
+	return (sph_u64)sph_dec32le_aligned(src)
+		| ((sph_u64)sph_dec32le_aligned((const char *)src + 4) << 32);
+#elif SPH_PPC64_GCC && !SPH_NO_ASM
+	sph_u64 tmp;
+
+	__asm__ __volatile__ ("ldbrx %0,0,%1" : "=r" (tmp) : "r" (src));
+	return tmp;
+ */
+#else
+	return sph_bswap64(*(const sph_u64 *)src);
+#endif
+#else
+	return (sph_u64)(((const unsigned char *)src)[0])
+		| ((sph_u64)(((const unsigned char *)src)[1]) << 8)
+		| ((sph_u64)(((const unsigned char *)src)[2]) << 16)
+		| ((sph_u64)(((const unsigned char *)src)[3]) << 24)
+		| ((sph_u64)(((const unsigned char *)src)[4]) << 32)
+		| ((sph_u64)(((const unsigned char *)src)[5]) << 40)
+		| ((sph_u64)(((const unsigned char *)src)[6]) << 48)
+		| ((sph_u64)(((const unsigned char *)src)[7]) << 56);
+#endif
+}
+
+#endif
+
+#endif /* Doxygen excluded block */
+
+#endif
diff --git a/util.c b/util.c
new file mode 100644
index 00000000..f2afed87
--- /dev/null
+++ b/util.c
@@ -0,0 +1,1318 @@
+/*
+ * Copyright 2010 Jeff Garzik
+ * Copyright 2012-2014 pooler
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version. See COPYING for more details.
+ */
+
+#define _GNU_SOURCE
+#include "cpuminer-config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <string.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+#include <jansson.h>
+#include <curl/curl.h>
+#if defined(WIN32)
+#include <winsock2.h>
+#include <mstcpip.h>
+#else
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#endif
+#include "compat.h"
+#include "miner.h"
+#include "elist.h"
+
+struct data_buffer {
+	void *buf;
+	size_t len;
+};
+
+struct upload_buffer {
+	const void *buf;
+	size_t len;
+	size_t pos;
+};
+
+struct header_info {
+	char *lp_path;
+	char *reason;
+	char *stratum_url;
+};
+
+struct tq_ent {
+	void *data;
+	struct list_head q_node;
+};
+
+struct thread_q {
+	struct list_head q;
+
+	bool frozen;
+
+	pthread_mutex_t mutex;
+	pthread_cond_t cond;
+};
+
+void applog(int prio, const char *fmt, ...)
+{ + va_list ap; + + va_start(ap, fmt); + +#ifdef HAVE_SYSLOG_H + if (use_syslog) { + va_list ap2; + char *buf; + int len; + + va_copy(ap2, ap); + len = vsnprintf(NULL, 0, fmt, ap2) + 1; + va_end(ap2); + buf = alloca(len); + if (vsnprintf(buf, len, fmt, ap) >= 0) + syslog(prio, "%s", buf); + } +#else + if (0) {} +#endif + else { + char *f; + int len; + time_t now; + struct tm tm, *tm_p; + + time(&now); + + pthread_mutex_lock(&applog_lock); + tm_p = localtime(&now); + memcpy(&tm, tm_p, sizeof(tm)); + pthread_mutex_unlock(&applog_lock); + + len = 40 + strlen(fmt) + 2; + f = alloca(len); + sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n", + tm.tm_year + 1900, + tm.tm_mon + 1, + tm.tm_mday, + tm.tm_hour, + tm.tm_min, + tm.tm_sec, + fmt); + pthread_mutex_lock(&applog_lock); + vfprintf(stderr, f, ap); /* atomic write to stderr */ + fflush(stderr); + pthread_mutex_unlock(&applog_lock); + } + va_end(ap); +} + +static void databuf_free(struct data_buffer *db) +{ + if (!db) + return; + + free(db->buf); + + memset(db, 0, sizeof(*db)); +} + +static size_t all_data_cb(const void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct data_buffer *db = user_data; + size_t len = size * nmemb; + size_t oldlen, newlen; + void *newmem; + static const unsigned char zero = 0; + + oldlen = db->len; + newlen = oldlen + len; + + newmem = realloc(db->buf, newlen + 1); + if (!newmem) + return 0; + + db->buf = newmem; + db->len = newlen; + memcpy(db->buf + oldlen, ptr, len); + memcpy(db->buf + newlen, &zero, 1); /* null terminate */ + + return len; +} + +static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb, + void *user_data) +{ + struct upload_buffer *ub = user_data; + int len = size * nmemb; + + if (len > ub->len - ub->pos) + len = ub->len - ub->pos; + + if (len) { + memcpy(ptr, ub->buf + ub->pos, len); + ub->pos += len; + } + + return len; +} + +#if LIBCURL_VERSION_NUM >= 0x071200 +static int seek_data_cb(void *user_data, curl_off_t offset, int origin) +{ + struct upload_buffer *ub = user_data; + + switch (origin) { + case SEEK_SET: + ub->pos = offset; + break; + case SEEK_CUR: + ub->pos += offset; + break; + case SEEK_END: + ub->pos = ub->len + offset; + break; + default: + return 1; /* CURL_SEEKFUNC_FAIL */ + } + + return 0; /* CURL_SEEKFUNC_OK */ +} +#endif + +static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data) +{ + struct header_info *hi = user_data; + size_t remlen, slen, ptrlen = size * nmemb; + char *rem, *val = NULL, *key = NULL; + void *tmp; + + val = calloc(1, ptrlen); + key = calloc(1, ptrlen); + if (!key || !val) + goto out; + + tmp = memchr(ptr, ':', ptrlen); + if (!tmp || (tmp == ptr)) /* skip empty keys / blanks */ + goto out; + slen = tmp - ptr; + if ((slen + 1) == ptrlen) /* skip key w/ no value */ + goto out; + memcpy(key, ptr, slen); /* store & nul term key */ + key[slen] = 0; + + rem = ptr + slen + 1; /* trim value's leading whitespace */ + remlen = ptrlen - slen - 1; + while ((remlen > 0) && (isspace(*rem))) { + remlen--; + rem++; + } + + memcpy(val, rem, remlen); /* store value, trim trailing ws */ + val[remlen] = 0; + while ((*val) && (isspace(val[strlen(val) - 1]))) { + val[strlen(val) - 1] = 0; + } + if (!*val) /* skip blank value */ + goto out; + + if (!strcasecmp("X-Long-Polling", key)) { + hi->lp_path = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Reject-Reason", key)) { + hi->reason = val; /* steal memory reference */ + val = NULL; + } + + if (!strcasecmp("X-Stratum", key)) { + hi->stratum_url = val; /* 
steal memory reference */ + val = NULL; + } + +out: + free(key); + free(val); + return ptrlen; +} + +#if LIBCURL_VERSION_NUM >= 0x070f06 +static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, + curlsocktype purpose) +{ + int keepalive = 1; + int tcp_keepcnt = 3; + int tcp_keepidle = 50; + int tcp_keepintvl = 50; + +#ifndef WIN32 + if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, + sizeof(keepalive)))) + return 1; +#ifdef __linux + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPCNT, + &tcp_keepcnt, sizeof(tcp_keepcnt)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPIDLE, + &tcp_keepidle, sizeof(tcp_keepidle)))) + return 1; + if (unlikely(setsockopt(fd, SOL_TCP, TCP_KEEPINTVL, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __linux */ +#ifdef __APPLE_CC__ + if (unlikely(setsockopt(fd, IPPROTO_TCP, TCP_KEEPALIVE, + &tcp_keepintvl, sizeof(tcp_keepintvl)))) + return 1; +#endif /* __APPLE_CC__ */ +#else /* WIN32 */ + struct tcp_keepalive vals; + vals.onoff = 1; + vals.keepalivetime = tcp_keepidle * 1000; + vals.keepaliveinterval = tcp_keepintvl * 1000; + DWORD outputBytes; + if (unlikely(WSAIoctl(fd, SIO_KEEPALIVE_VALS, &vals, sizeof(vals), + NULL, 0, &outputBytes, NULL, NULL))) + return 1; +#endif /* WIN32 */ + + return 0; +} +#endif + +json_t *json_rpc_call(CURL *curl, const char *url, + const char *userpass, const char *rpc_req, + int *curl_err, int flags) +{ + json_t *val, *err_val, *res_val; + int rc; + long http_rc; + struct data_buffer all_data = {0}; + struct upload_buffer upload_data; + json_error_t err; + struct curl_slist *headers = NULL; + char len_hdr[64]; + char curl_err_str[CURL_ERROR_SIZE]; + long timeout = (flags & JSON_RPC_LONGPOLL) ? opt_timeout : 30; + struct header_info hi = {0}; + + /* it is assumed that 'curl' is freshly [re]initialized at this pt */ + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, url); + if (opt_cert) + curl_easy_setopt(curl, CURLOPT_CAINFO, opt_cert); + curl_easy_setopt(curl, CURLOPT_ENCODING, ""); + curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data); + curl_easy_setopt(curl, CURLOPT_READFUNCTION, upload_data_cb); + curl_easy_setopt(curl, CURLOPT_READDATA, &upload_data); +#if LIBCURL_VERSION_NUM >= 0x071200 + curl_easy_setopt(curl, CURLOPT_SEEKFUNCTION, &seek_data_cb); + curl_easy_setopt(curl, CURLOPT_SEEKDATA, &upload_data); +#endif + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str); + if (opt_redirect) + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, resp_hdr_cb); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &hi); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + if (userpass) { + curl_easy_setopt(curl, CURLOPT_USERPWD, userpass); + curl_easy_setopt(curl, CURLOPT_HTTPAUTH, CURLAUTH_BASIC); + } +#if LIBCURL_VERSION_NUM >= 0x070f06 + if (flags & JSON_RPC_LONGPOLL) + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif + curl_easy_setopt(curl, CURLOPT_POST, 1); + + if (opt_protocol) + applog(LOG_DEBUG, "JSON protocol request:\n%s\n", rpc_req); + + upload_data.buf = rpc_req; + upload_data.len = strlen(rpc_req); 
+ upload_data.pos = 0; + sprintf(len_hdr, "Content-Length: %lu", + (unsigned long) upload_data.len); + + headers = curl_slist_append(headers, "Content-Type: application/json"); + headers = curl_slist_append(headers, len_hdr); + headers = curl_slist_append(headers, "User-Agent: " USER_AGENT); + headers = curl_slist_append(headers, "X-Mining-Extensions: midstate"); + headers = curl_slist_append(headers, "Accept:"); /* disable Accept hdr*/ + headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/ + + curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); + + rc = curl_easy_perform(curl); + if (curl_err != NULL) + *curl_err = rc; + if (rc) { + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_rc); + if (!((flags & JSON_RPC_LONGPOLL) && rc == CURLE_OPERATION_TIMEDOUT) && + !((flags & JSON_RPC_QUIET_404) && http_rc == 404)) + applog(LOG_ERR, "HTTP request failed: %s", curl_err_str); + goto err_out; + } + + /* If X-Stratum was found, activate Stratum */ + if (want_stratum && hi.stratum_url && + !strncasecmp(hi.stratum_url, "stratum+tcp://", 14)) { + have_stratum = true; + tq_push(thr_info[stratum_thr_id].q, hi.stratum_url); + hi.stratum_url = NULL; + } + + /* If X-Long-Polling was found, activate long polling */ + if (!have_longpoll && want_longpoll && hi.lp_path && !have_stratum) { + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, hi.lp_path); + hi.lp_path = NULL; + } + + if (!all_data.buf) { + applog(LOG_ERR, "Empty data received in json_rpc_call."); + goto err_out; + } + + val = JSON_LOADS(all_data.buf, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto err_out; + } + + if (opt_protocol) { + char *s = json_dumps(val, JSON_INDENT(3)); + applog(LOG_DEBUG, "JSON protocol response:\n%s", s); + free(s); + } + + /* JSON-RPC valid response returns a non-null 'result', + * and a null 'error'. */ + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + char *s; + + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + + free(s); + + goto err_out; + } + + if (hi.reason) + json_object_set_new(val, "reject-reason", json_string(hi.reason)); + + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return val; + +err_out: + free(hi.lp_path); + free(hi.reason); + free(hi.stratum_url); + databuf_free(&all_data); + curl_slist_free_all(headers); + curl_easy_reset(curl); + return NULL; +} + +char *bin2hex(const unsigned char *p, size_t len) +{ + int i; + char *s = malloc((len * 2) + 1); + if (!s) + return NULL; + + for (i = 0; i < len; i++) + sprintf(s + (i * 2), "%02x", (unsigned int) p[i]); + + return s; +} + +bool hex2bin(unsigned char *p, const char *hexstr, size_t len) +{ + char hex_byte[3]; + char *ep; + + hex_byte[2] = '\0'; + + while (*hexstr && len) { + if (!hexstr[1]) { + applog(LOG_ERR, "hex2bin str truncated"); + return false; + } + hex_byte[0] = hexstr[0]; + hex_byte[1] = hexstr[1]; + *p = (unsigned char) strtol(hex_byte, &ep, 16); + if (*ep) { + applog(LOG_ERR, "hex2bin failed on '%s'", hex_byte); + return false; + } + p++; + hexstr += 2; + len--; + } + + return (len == 0 && *hexstr == 0) ? true : false; +} + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. 
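+
+   A hypothetical worked example: subtracting y = {3, 700000} from
+   x = {5, 200000}, the carry step first rewrites y as {4, -300000}, so
+   the function stores result = {1, 500000} (i.e. 1.5 seconds) and
+   returns 0.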
*/ +int timeval_subtract(struct timeval *result, struct timeval *x, + struct timeval *y) +{ + /* Perform the carry for the later subtraction by updating Y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + * `tv_usec' is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +bool fulltest(const uint32_t *hash, const uint32_t *target) +{ + int i; + bool rc = true; + + for (i = 7; i >= 0; i--) { + if (hash[i] > target[i]) { + rc = false; + break; + } + if (hash[i] < target[i]) { + rc = true; + break; + } + } + + if (opt_debug) { + uint32_t hash_be[8], target_be[8]; + char *hash_str, *target_str; + + for (i = 0; i < 8; i++) { + be32enc(hash_be + i, hash[7 - i]); + be32enc(target_be + i, target[7 - i]); + } + hash_str = bin2hex((unsigned char *)hash_be, 32); + target_str = bin2hex((unsigned char *)target_be, 32); + + applog(LOG_DEBUG, "DEBUG: %s\nHash: %s\nTarget: %s", + rc ? "hash <= target" + : "hash > target (false positive)", + hash_str, + target_str); + + free(hash_str); + free(target_str); + } + + return rc; +} + +void diff_to_target(uint32_t *target, double diff) +{ + uint64_t m; + int k; + + for (k = 6; k > 0 && diff > 1.0; k--) + diff /= 4294967296.0; + m = 4294901760.0 / diff; + if (m == 0 && k == 6) + memset(target, 0xff, 32); + else { + memset(target, 0, 32); + target[k] = (uint32_t)m; + target[k + 1] = (uint32_t)(m >> 32); + } +} + +#ifdef WIN32 +#define socket_blocks() (WSAGetLastError() == WSAEWOULDBLOCK) +#else +#define socket_blocks() (errno == EAGAIN || errno == EWOULDBLOCK) +#endif + +static bool send_line(curl_socket_t sock, char *s) +{ + ssize_t len, sent = 0; + + len = strlen(s); + s[len++] = '\n'; + + while (len > 0) { + struct timeval timeout = {0, 0}; + ssize_t n; + fd_set wd; + + FD_ZERO(&wd); + FD_SET(sock, &wd); + if (select(sock + 1, NULL, &wd, NULL, &timeout) < 1) + return false; + n = send(sock, s + sent, len, 0); + if (n < 0) { + if (!socket_blocks()) + return false; + n = 0; + } + sent += n; + len -= n; + } + + return true; +} + +bool stratum_send_line(struct stratum_ctx *sctx, char *s) +{ + bool ret = false; + + if (opt_protocol) + applog(LOG_DEBUG, "> %s", s); + + pthread_mutex_lock(&sctx->sock_lock); + ret = send_line(sctx->sock, s); + pthread_mutex_unlock(&sctx->sock_lock); + + return ret; +} + +static bool socket_full(curl_socket_t sock, int timeout) +{ + struct timeval tv; + fd_set rd; + + FD_ZERO(&rd); + FD_SET(sock, &rd); + tv.tv_sec = timeout; + tv.tv_usec = 0; + if (select(sock + 1, &rd, NULL, NULL, &tv) > 0) + return true; + return false; +} + +bool stratum_socket_full(struct stratum_ctx *sctx, int timeout) +{ + return strlen(sctx->sockbuf) || socket_full(sctx->sock, timeout); +} + +#define RBUFSIZE 2048 +#define RECVSIZE (RBUFSIZE - 4) + +static void stratum_buffer_append(struct stratum_ctx *sctx, const char *s) +{ + size_t old, new; + + old = strlen(sctx->sockbuf); + new = old + strlen(s) + 1; + if (new >= sctx->sockbuf_size) { + sctx->sockbuf_size = new + (RBUFSIZE - (new % RBUFSIZE)); + sctx->sockbuf = realloc(sctx->sockbuf, sctx->sockbuf_size); + } + strcpy(sctx->sockbuf + old, s); +} + +char *stratum_recv_line(struct 
stratum_ctx *sctx) +{ + ssize_t len, buflen; + char *tok, *sret = NULL; + + if (!strstr(sctx->sockbuf, "\n")) { + bool ret = true; + time_t rstart; + + time(&rstart); + if (!socket_full(sctx->sock, 60)) { + applog(LOG_ERR, "stratum_recv_line timed out"); + goto out; + } + do { + char s[RBUFSIZE]; + ssize_t n; + + memset(s, 0, RBUFSIZE); + n = recv(sctx->sock, s, RECVSIZE, 0); + if (!n) { + ret = false; + break; + } + if (n < 0) { + if (!socket_blocks() || !socket_full(sctx->sock, 1)) { + ret = false; + break; + } + } else + stratum_buffer_append(sctx, s); + } while (time(NULL) - rstart < 60 && !strstr(sctx->sockbuf, "\n")); + + if (!ret) { + applog(LOG_ERR, "stratum_recv_line failed"); + goto out; + } + } + + buflen = strlen(sctx->sockbuf); + tok = strtok(sctx->sockbuf, "\n"); + if (!tok) { + applog(LOG_ERR, "stratum_recv_line failed to parse a newline-terminated string"); + goto out; + } + sret = strdup(tok); + len = strlen(sret); + + if (buflen > len + 1) + memmove(sctx->sockbuf, sctx->sockbuf + len + 1, buflen - len + 1); + else + sctx->sockbuf[0] = '\0'; + +out: + if (sret && opt_protocol) + applog(LOG_DEBUG, "< %s", sret); + return sret; +} + +#if LIBCURL_VERSION_NUM >= 0x071101 +static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, + struct curl_sockaddr *addr) +{ + curl_socket_t *sock = clientp; + *sock = socket(addr->family, addr->socktype, addr->protocol); + return *sock; +} +#endif + +bool stratum_connect(struct stratum_ctx *sctx, const char *url) +{ + CURL *curl; + int rc; + + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) + curl_easy_cleanup(sctx->curl); + sctx->curl = curl_easy_init(); + if (!sctx->curl) { + applog(LOG_ERR, "CURL initialization failed"); + pthread_mutex_unlock(&sctx->sock_lock); + return false; + } + curl = sctx->curl; + if (!sctx->sockbuf) { + sctx->sockbuf = calloc(RBUFSIZE, 1); + sctx->sockbuf_size = RBUFSIZE; + } + sctx->sockbuf[0] = '\0'; + pthread_mutex_unlock(&sctx->sock_lock); + + if (url != sctx->url) { + free(sctx->url); + sctx->url = strdup(url); + } + free(sctx->curl_url); + sctx->curl_url = malloc(strlen(url)); + sprintf(sctx->curl_url, "http%s", strstr(url, "://")); + + if (opt_protocol) + curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(curl, CURLOPT_URL, sctx->curl_url); + curl_easy_setopt(curl, CURLOPT_FRESH_CONNECT, 1); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30); + curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, sctx->curl_err_str); + curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1); + if (opt_proxy) { + curl_easy_setopt(curl, CURLOPT_PROXY, opt_proxy); + curl_easy_setopt(curl, CURLOPT_PROXYTYPE, opt_proxy_type); + } + curl_easy_setopt(curl, CURLOPT_HTTPPROXYTUNNEL, 1); +#if LIBCURL_VERSION_NUM >= 0x070f06 + curl_easy_setopt(curl, CURLOPT_SOCKOPTFUNCTION, sockopt_keepalive_cb); +#endif +#if LIBCURL_VERSION_NUM >= 0x071101 + curl_easy_setopt(curl, CURLOPT_OPENSOCKETFUNCTION, opensocket_grab_cb); + curl_easy_setopt(curl, CURLOPT_OPENSOCKETDATA, &sctx->sock); +#endif + curl_easy_setopt(curl, CURLOPT_CONNECT_ONLY, 1); + + rc = curl_easy_perform(curl); + if (rc) { + applog(LOG_ERR, "Stratum connection failed: %s", sctx->curl_err_str); + curl_easy_cleanup(curl); + sctx->curl = NULL; + return false; + } + +#if LIBCURL_VERSION_NUM < 0x071101 + /* CURLINFO_LASTSOCKET is broken on Win64; only use it as a last resort */ + curl_easy_getinfo(curl, CURLINFO_LASTSOCKET, (long *)&sctx->sock); +#endif + + return true; +} + +void stratum_disconnect(struct 
stratum_ctx *sctx) +{ + pthread_mutex_lock(&sctx->sock_lock); + if (sctx->curl) { + curl_easy_cleanup(sctx->curl); + sctx->curl = NULL; + sctx->sockbuf[0] = '\0'; + } + pthread_mutex_unlock(&sctx->sock_lock); +} + +static const char *get_stratum_session_id(json_t *val) +{ + json_t *arr_val; + int i, n; + + arr_val = json_array_get(val, 0); + if (!arr_val || !json_is_array(arr_val)) + return NULL; + n = json_array_size(arr_val); + for (i = 0; i < n; i++) { + const char *notify; + json_t *arr = json_array_get(arr_val, i); + + if (!arr || !json_is_array(arr)) + break; + notify = json_string_value(json_array_get(arr, 0)); + if (!notify) + continue; + if (!strcasecmp(notify, "mining.notify")) + return json_string_value(json_array_get(arr, 1)); + } + return NULL; +} + +bool stratum_subscribe(struct stratum_ctx *sctx) +{ + char *s, *sret = NULL; + const char *sid, *xnonce1; + int xn2_size; + json_t *val = NULL, *res_val, *err_val; + json_error_t err; + bool ret = false, retry = false; + +start: + s = malloc(128 + (sctx->session_id ? strlen(sctx->session_id) : 0)); + if (retry) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": []}"); + else if (sctx->session_id) + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\", \"%s\"]}", sctx->session_id); + else + sprintf(s, "{\"id\": 1, \"method\": \"mining.subscribe\", \"params\": [\"" USER_AGENT "\"]}"); + + if (!stratum_send_line(sctx, s)) { + applog(LOG_ERR, "stratum_subscribe send failed"); + goto out; + } + + if (!socket_full(sctx->sock, 30)) { + applog(LOG_ERR, "stratum_subscribe timed out"); + goto out; + } + + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_null(res_val) || + (err_val && !json_is_null(err_val))) { + if (opt_debug || retry) { + free(s); + if (err_val) + s = json_dumps(err_val, JSON_INDENT(3)); + else + s = strdup("(unknown reason)"); + applog(LOG_ERR, "JSON-RPC call failed: %s", s); + } + goto out; + } + + sid = get_stratum_session_id(res_val); + if (opt_debug && !sid) + applog(LOG_DEBUG, "Failed to get Stratum session id"); + xnonce1 = json_string_value(json_array_get(res_val, 1)); + if (!xnonce1) { + applog(LOG_ERR, "Failed to get extranonce1"); + goto out; + } + xn2_size = json_integer_value(json_array_get(res_val, 2)); + if (!xn2_size) { + applog(LOG_ERR, "Failed to get extranonce2_size"); + goto out; + } + + pthread_mutex_lock(&sctx->work_lock); + free(sctx->session_id); + free(sctx->xnonce1); + sctx->session_id = sid ? 
strdup(sid) : NULL; + sctx->xnonce1_size = strlen(xnonce1) / 2; + sctx->xnonce1 = malloc(sctx->xnonce1_size); + hex2bin(sctx->xnonce1, xnonce1, sctx->xnonce1_size); + sctx->xnonce2_size = xn2_size; + sctx->next_diff = 1.0; + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug && sid) + applog(LOG_DEBUG, "Stratum session id: %s", sctx->session_id); + + ret = true; + +out: + free(s); + if (val) + json_decref(val); + + if (!ret) { + if (sret && !retry) { + retry = true; + goto start; + } + } + + return ret; +} + +bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *pass) +{ + json_t *val = NULL, *res_val, *err_val; + char *s, *sret; + json_error_t err; + bool ret = false; + + s = malloc(80 + strlen(user) + strlen(pass)); + sprintf(s, "{\"id\": 2, \"method\": \"mining.authorize\", \"params\": [\"%s\", \"%s\"]}", + user, pass); + + if (!stratum_send_line(sctx, s)) + goto out; + + while (1) { + sret = stratum_recv_line(sctx); + if (!sret) + goto out; + if (!stratum_handle_method(sctx, sret)) + break; + free(sret); + } + + val = JSON_LOADS(sret, &err); + free(sret); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + res_val = json_object_get(val, "result"); + err_val = json_object_get(val, "error"); + + if (!res_val || json_is_false(res_val) || + (err_val && !json_is_null(err_val))) { + applog(LOG_ERR, "Stratum authentication failed"); + goto out; + } + + ret = true; + +out: + free(s); + if (val) + json_decref(val); + + return ret; +} + +static bool stratum_notify(struct stratum_ctx *sctx, json_t *params) +{ + const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *ntime; + size_t coinb1_size, coinb2_size; + bool clean, ret = false; + int merkle_count, i; + json_t *merkle_arr; + unsigned char **merkle; + + job_id = json_string_value(json_array_get(params, 0)); + prevhash = json_string_value(json_array_get(params, 1)); + coinb1 = json_string_value(json_array_get(params, 2)); + coinb2 = json_string_value(json_array_get(params, 3)); + merkle_arr = json_array_get(params, 4); + if (!merkle_arr || !json_is_array(merkle_arr)) + goto out; + merkle_count = json_array_size(merkle_arr); + version = json_string_value(json_array_get(params, 5)); + nbits = json_string_value(json_array_get(params, 6)); + ntime = json_string_value(json_array_get(params, 7)); + clean = json_is_true(json_array_get(params, 8)); + + if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !ntime || + strlen(prevhash) != 64 || strlen(version) != 8 || + strlen(nbits) != 8 || strlen(ntime) != 8) { + applog(LOG_ERR, "Stratum notify: invalid parameters"); + goto out; + } + merkle = malloc(merkle_count * sizeof(char *)); + for (i = 0; i < merkle_count; i++) { + const char *s = json_string_value(json_array_get(merkle_arr, i)); + if (!s || strlen(s) != 64) { + while (i--) + free(merkle[i]); + free(merkle); + applog(LOG_ERR, "Stratum notify: invalid Merkle branch"); + goto out; + } + merkle[i] = malloc(32); + hex2bin(merkle[i], s, 32); + } + + pthread_mutex_lock(&sctx->work_lock); + + coinb1_size = strlen(coinb1) / 2; + coinb2_size = strlen(coinb2) / 2; + sctx->job.coinbase_size = coinb1_size + sctx->xnonce1_size + + sctx->xnonce2_size + coinb2_size; + sctx->job.coinbase = realloc(sctx->job.coinbase, sctx->job.coinbase_size); + sctx->job.xnonce2 = sctx->job.coinbase + coinb1_size + sctx->xnonce1_size; + hex2bin(sctx->job.coinbase, coinb1, coinb1_size); + memcpy(sctx->job.coinbase + coinb1_size, sctx->xnonce1, sctx->xnonce1_size); + 
if (!sctx->job.job_id || strcmp(sctx->job.job_id, job_id)) + memset(sctx->job.xnonce2, 0, sctx->xnonce2_size); + hex2bin(sctx->job.xnonce2 + sctx->xnonce2_size, coinb2, coinb2_size); + + free(sctx->job.job_id); + sctx->job.job_id = strdup(job_id); + hex2bin(sctx->job.prevhash, prevhash, 32); + + for (i = 0; i < sctx->job.merkle_count; i++) + free(sctx->job.merkle[i]); + free(sctx->job.merkle); + sctx->job.merkle = merkle; + sctx->job.merkle_count = merkle_count; + + hex2bin(sctx->job.version, version, 4); + hex2bin(sctx->job.nbits, nbits, 4); + hex2bin(sctx->job.ntime, ntime, 4); + sctx->job.clean = clean; + + sctx->job.diff = sctx->next_diff; + + pthread_mutex_unlock(&sctx->work_lock); + + ret = true; + +out: + return ret; +} + +static bool stratum_set_difficulty(struct stratum_ctx *sctx, json_t *params) +{ + double diff; + + diff = json_number_value(json_array_get(params, 0)); + if (diff == 0) + return false; + + pthread_mutex_lock(&sctx->work_lock); + sctx->next_diff = diff; + pthread_mutex_unlock(&sctx->work_lock); + + if (opt_debug) + applog(LOG_DEBUG, "Stratum difficulty set to %g", diff); + + return true; +} + +static bool stratum_reconnect(struct stratum_ctx *sctx, json_t *params) +{ + json_t *port_val; + char *url; + const char *host; + int port; + + host = json_string_value(json_array_get(params, 0)); + port_val = json_array_get(params, 1); + if (json_is_string(port_val)) + port = atoi(json_string_value(port_val)); + else + port = json_integer_value(port_val); + if (!host || !port) + return false; + + url = malloc(32 + strlen(host)); + sprintf(url, "stratum+tcp://%s:%d", host, port); + + if (!opt_redirect) { + applog(LOG_INFO, "Ignoring request to reconnect to %s", url); + free(url); + return true; + } + + applog(LOG_NOTICE, "Server requested reconnection to %s", url); + + free(sctx->url); + sctx->url = url; + stratum_disconnect(sctx); + + return true; +} + +static bool stratum_get_version(struct stratum_ctx *sctx, json_t *id) +{ + char *s; + json_t *val; + bool ret; + + if (!id || json_is_null(id)) + return false; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_string(USER_AGENT)); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +static bool stratum_show_message(struct stratum_ctx *sctx, json_t *id, json_t *params) +{ + char *s; + json_t *val; + bool ret; + + val = json_array_get(params, 0); + if (val) + applog(LOG_NOTICE, "MESSAGE FROM SERVER: %s", json_string_value(val)); + + if (!id || json_is_null(id)) + return true; + + val = json_object(); + json_object_set(val, "id", id); + json_object_set_new(val, "error", json_null()); + json_object_set_new(val, "result", json_true()); + s = json_dumps(val, 0); + ret = stratum_send_line(sctx, s); + json_decref(val); + free(s); + + return ret; +} + +bool stratum_handle_method(struct stratum_ctx *sctx, const char *s) +{ + json_t *val, *id, *params; + json_error_t err; + const char *method; + bool ret = false; + + val = JSON_LOADS(s, &err); + if (!val) { + applog(LOG_ERR, "JSON decode failed(%d): %s", err.line, err.text); + goto out; + } + + method = json_string_value(json_object_get(val, "method")); + if (!method) + goto out; + id = json_object_get(val, "id"); + params = json_object_get(val, "params"); + + if (!strcasecmp(method, "mining.notify")) { + ret = stratum_notify(sctx, params); + goto out; + } + if (!strcasecmp(method, "mining.set_difficulty")) { + ret = 
stratum_set_difficulty(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.reconnect")) { + ret = stratum_reconnect(sctx, params); + goto out; + } + if (!strcasecmp(method, "client.get_version")) { + ret = stratum_get_version(sctx, id); + goto out; + } + if (!strcasecmp(method, "client.show_message")) { + ret = stratum_show_message(sctx, id, params); + goto out; + } + +out: + if (val) + json_decref(val); + + return ret; +} + +struct thread_q *tq_new(void) +{ + struct thread_q *tq; + + tq = calloc(1, sizeof(*tq)); + if (!tq) + return NULL; + + INIT_LIST_HEAD(&tq->q); + pthread_mutex_init(&tq->mutex, NULL); + pthread_cond_init(&tq->cond, NULL); + + return tq; +} + +void tq_free(struct thread_q *tq) +{ + struct tq_ent *ent, *iter; + + if (!tq) + return; + + list_for_each_entry_safe(ent, iter, &tq->q, q_node) { + list_del(&ent->q_node); + free(ent); + } + + pthread_cond_destroy(&tq->cond); + pthread_mutex_destroy(&tq->mutex); + + memset(tq, 0, sizeof(*tq)); /* poison */ + free(tq); +} + +static void tq_freezethaw(struct thread_q *tq, bool frozen) +{ + pthread_mutex_lock(&tq->mutex); + + tq->frozen = frozen; + + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); +} + +void tq_freeze(struct thread_q *tq) +{ + tq_freezethaw(tq, true); +} + +void tq_thaw(struct thread_q *tq) +{ + tq_freezethaw(tq, false); +} + +bool tq_push(struct thread_q *tq, void *data) +{ + struct tq_ent *ent; + bool rc = true; + + ent = calloc(1, sizeof(*ent)); + if (!ent) + return false; + + ent->data = data; + INIT_LIST_HEAD(&ent->q_node); + + pthread_mutex_lock(&tq->mutex); + + if (!tq->frozen) { + list_add_tail(&ent->q_node, &tq->q); + } else { + free(ent); + rc = false; + } + + pthread_cond_signal(&tq->cond); + pthread_mutex_unlock(&tq->mutex); + + return rc; +} + +void *tq_pop(struct thread_q *tq, const struct timespec *abstime) +{ + struct tq_ent *ent; + void *rval = NULL; + int rc; + + pthread_mutex_lock(&tq->mutex); + + if (!list_empty(&tq->q)) + goto pop; + + if (abstime) + rc = pthread_cond_timedwait(&tq->cond, &tq->mutex, abstime); + else + rc = pthread_cond_wait(&tq->cond, &tq->mutex); + if (rc) + goto out; + if (list_empty(&tq->q)) + goto out; + +pop: + ent = list_entry(tq->q.next, struct tq_ent, q_node); + rval = ent->data; + + list_del(&ent->q_node); + free(ent); + +out: + pthread_mutex_unlock(&tq->mutex); + return rval; +}
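+
+/*
+ * Usage sketch for the thread_q API above (illustrative only, not part of
+ * the miner): because tq_pop returns immediately whenever the list is
+ * non-empty, a single thread can exercise the queue end to end.
+ *
+ *	struct thread_q *q = tq_new();
+ *	if (q) {
+ *		tq_push(q, strdup("work item"));   // producer side
+ *		char *item = tq_pop(q, NULL);      // consumer side
+ *		free(item);
+ *		tq_free(q);
+ *	}
+ */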