diff --git a/INSTALL b/INSTALL
index 988761c8db..cac32a1b62 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,77 +1,97 @@
-
-You can use ./build.sh to configure and build with default options.
-
-It is advised to run ./autogen.sh before ./configure (autoconf and automake
-need to be installed on your system for autogen.sh to work)
-
-./configure has an option named --with-cuda that allows you to specify
-where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda,
-but some distros may have a different default location)
-
-
-** How to compile on Fedora 25 **
-
-Note: You may find an alternative method via rpms :
-see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/
-
-
-# Step 1: gcc and dependencies
-dnf install gcc gcc-c++ autoconf automake
-dnf install jansson-devel openssl-devel libcurl-devel zlib-devel
-
-# Step 2: nvidia drivers (Download common linux drivers from nvidia site)
-dnf install kernel-devel
-dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm
-dnf check-update
-dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia
-ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so
-
-# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive)
-# --override is required to ignore "too recent" gcc 6.3
-# --silent is required to install only the toolkit (no kmod)
-./cuda_8.0.61_375.26_linux.run --toolkit --silent --override
-nvcc --version
-
-# add the nvcc binary path to the system
-ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made)
-echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh
-
-# add the cudart library path to the system
-echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf
-ldconfig
-
-# Step 4: Fix the toolkit incompatibility with gcc 6
-
-# You need to build yourself an older GCC/G++ version, i recommend the 5.4
-# see https://gcc.gnu.org/mirrors.html
-# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix
-
-wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2
-dnf install libmpc-devel mpfr-devel gmp-devel
-./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib
-make -j 8 && make install
-(while this step, you have the time to cook something :p)
-
-# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h
-# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported!
-
-./build.sh
-
-./ccminer -n
-
-
-** How to compile on macOS **
-
-# Step 1: download and install CUDA Toolkit 8 or more recent
-# https://developer.nvidia.com/cuda-toolkit-archive
-
-# Step 2: install Homebrew
-ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
-
-# Step 3: dependencies
-brew install pkg-config autoconf automake curl openssl llvm
-
-./build.sh
-
-./ccminer -n
-
+
+You can use ./build.sh to configure and build with default options.
+
+It is advised to run ./autogen.sh before ./configure (autoconf and automake
+need to be installed on your system for autogen.sh to work)
+
+./configure has an option named --with-cuda that allows you to specify
+where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda,
+but some distros may have a different default location)
+
+
+** How to compile on Ubuntu (16.04 LTS) **
+
+First, install the CUDA toolkit and the NVIDIA driver, and type `nvidia-smi` to check if your card is detected.
+
+Install dependencies
+```sudo apt-get install libcurl4-openssl-dev libssl-dev libjansson-dev automake autotools-dev build-essential```
+
+Ubuntu is now shipped with gcc 6 or 7 so please install gcc/g++ 5 and make it the default (required by the cuda toolkit)
+```
+sudo apt-get install gcc-5 g++-5
+sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1
+```
+
+Then use the helper ./build.sh in the ccminer source folder, edit configure.sh and the Makefile.am if required.
+```
+./build.sh
+./ccminer --version
+```
+
+
+** How to compile on Fedora 25 **
+
+Note: You may find an alternative method via rpms :
+see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/
+
+
+# Step 1: gcc and dependencies
+dnf install gcc gcc-c++ autoconf automake
+dnf install jansson-devel openssl-devel libcurl-devel zlib-devel
+
+# Step 2: nvidia drivers (Download common linux drivers from nvidia site)
+dnf install kernel-devel
+dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm
+dnf check-update
+dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia
+ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so
+
+# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive)
+# --override is required to ignore "too recent" gcc 6.3
+# --silent is required to install only the toolkit (no kmod)
+./cuda_8.0.61_375.26_linux.run --toolkit --silent --override
+nvcc --version
+
+# add the nvcc binary path to the system
+ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made)
+echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh
+
+# add the cudart library path to the system
+echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf
+ldconfig
+
+# Step 4: Fix the toolkit incompatibility with gcc 6
+
+# You need to build yourself an older GCC/G++ version, I recommend the 5.4
+# see https://gcc.gnu.org/mirrors.html
+# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix
+
+wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2
+dnf install libmpc-devel mpfr-devel gmp-devel
+./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib
+make -j 8 && make install
+(during this step, you have time to cook something :p)
+
+# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h
+# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported!
+
+./build.sh
+
+./ccminer -n
+
+
+** How to compile on macOS **
+
+# Step 1: download and install CUDA Toolkit 8 or more recent
+# https://developer.nvidia.com/cuda-toolkit-archive
+
+# Step 2: install Homebrew
+ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
+
+# Step 3: dependencies
+brew install pkg-config autoconf automake curl openssl llvm
+
+./build.sh
+
+./ccminer -n
+
diff --git a/JHA/jha.cu b/JHA/jha.cu
index ec7895c10d..94172ff585 100644
--- a/JHA/jha.cu
+++ b/JHA/jha.cu
@@ -147,6 +147,9 @@ extern "C" int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, u
CUDA_LOG_ERROR();
}
cuda_get_arch(thr_id);
+ if (cuda_arch[dev_id] >= 500) {
+ applog(LOG_WARNING, "You are not using the optimal algo, please try -a jackpot");
+ }
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
diff --git a/Makefile.am b/Makefile.am
index cc7a1698d9..abf3115a1f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -66,14 +66,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/ripemd.c sph/sph_sha2.c \
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
- tribus.cu \
+ tribus/tribus.cu tribus/cuda_echo512_final.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
+ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
+ x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
# scrypt
diff --git a/README.md b/README.md
index 001c4ecab5..5bcb50572c 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
-ccminer
-=======
+# ccminer
Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github since 2014.
@@ -10,7 +9,7 @@ BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)
A part of the recent algos were originally written by [djm34](https://github.com/djm34) and [alexis78](https://github.com/alexis78)
This variant was tested and built on Linux (ubuntu server 14.04, 16.04, Fedora 22 to 25)
-It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows Vista.
+It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista.
Note that the x86 releases are generally faster than x64 ones on Windows, but that tend to change with the recent drivers.
@@ -22,9 +21,7 @@ About source code dependencies
This project requires some libraries to be built :
- OpenSSL (prebuilt for win)
-
- Curl (prebuilt for win)
-
- pthreads (prebuilt for win)
The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows).
@@ -32,5 +29,8 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64
To rebuild them, you need to clone this repository and its submodules :
git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows
-On Linux, you can use the helper ./build.sh (edit configure.sh and the Makefile.am if required)
+Compile on Linux
+----------------
+
+Please see [INSTALL](https://github.com/tpruvot/ccminer/blob/linux/INSTALL) file or [project Wiki](https://github.com/tpruvot/ccminer/wiki/Compatibility)
diff --git a/README.txt b/README.txt
index 0fa05f9b22..290ba605c1 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccminer 2.2 (August 2017) "Equihash, tribus and optimized skunk"
+ccminer 2.2.2 (Oct. 2017) "phi and hsr algos"
---------------------------------------------------------------
***************************************************************
@@ -90,7 +90,8 @@ its command line interface and options.
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
heavy use to mine Heavycoin
- jha use to mine JackpotCoin
+ hsr use to mine Hshare
+ jackpot use to mine Sweepcoin
keccak use to mine Maxcoin
lbry use to mine LBRY Credits
luffa use to mine Joincoin
@@ -102,6 +103,7 @@ its command line interface and options.
neoscrypt use to mine FeatherCoin
nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake
+ phi use to mine LUXCoin
quark use to mine Quarkcoin
qubit use to mine Qubit
scrypt use to mine Scrypt coins
@@ -155,6 +157,7 @@ its command line interface and options.
-T, --timeout=N network timeout, in seconds (default: 300)
-s, --scantime=N upper bound on time spent scanning current work when
long polling is unavailable, in seconds (default: 5)
+ --submit-stale ignore stale job checks, may create more rejected shares
-n, --ndevs list cuda devices
-N, --statsavg number of samples used to display hashrate (default: 30)
--no-gbt disable getblocktemplate support (height check in solo)
@@ -277,6 +280,16 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
+ Oct. 09th 2017 v2.2.2
+ Import and clean the hsr algo (x13 + custom hash)
+ Import and optimise phi algo from LuxCoin repository
+ Improve sib algo too for maxwell and pascal cards
+ Small fix to handle more than 9 cards on linux (-d 10+)
+ Attempt to free equihash memory "properly"
+ --submit-stale parameter for supernova pool (which changes diff too fast)
+
+ Sep. 01st 2017 v2.2.1
+ Improve tribus algo on recent cards (up to +10%)
Aug. 13th 2017 v2.2
New skunk algo, using the heavy streebog algorithm
diff --git a/algos.h b/algos.h
index b7dd0f21fa..3c1528b326 100644
--- a/algos.h
+++ b/algos.h
@@ -22,6 +22,7 @@ enum sha_algos {
ALGO_GROESTL,
ALGO_HEAVY, /* Heavycoin hash */
ALGO_HMQ1725,
+ ALGO_HSR,
ALGO_KECCAK,
ALGO_JACKPOT,
ALGO_JHA,
@@ -35,6 +36,7 @@ enum sha_algos {
ALGO_NEOSCRYPT,
ALGO_NIST5,
ALGO_PENTABLAKE,
+ ALGO_PHI,
ALGO_QUARK,
ALGO_QUBIT,
ALGO_SCRYPT,
@@ -87,6 +89,7 @@ static const char *algo_names[] = {
"groestl",
"heavy",
"hmq1725",
+ "hsr",
"keccak",
"jackpot",
"jha",
@@ -100,6 +103,7 @@ static const char *algo_names[] = {
"neoscrypt",
"nist5",
"penta",
+ "phi",
"quark",
"qubit",
"scrypt",
@@ -161,12 +165,16 @@ static inline int algo_to_int(char* arg)
i = ALGO_LUFFA;
else if (!strcasecmp("hmq17", arg))
i = ALGO_HMQ1725;
+ else if (!strcasecmp("hshare", arg))
+ i = ALGO_HSR;
//else if (!strcasecmp("jackpot", arg))
// i = ALGO_JHA;
else if (!strcasecmp("lyra2re", arg))
i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg))
i = ALGO_LYRA2v2;
+ else if (!strcasecmp("phi1612", arg))
+ i = ALGO_PHI;
else if (!strcasecmp("bitcoin", arg))
i = ALGO_SHA256D;
else if (!strcasecmp("sha256", arg))
diff --git a/bench.cpp b/bench.cpp
index 8f32c40fd0..b1bb5bc87d 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -59,12 +59,14 @@ void algo_free_all(int thr_id)
free_cryptonight(thr_id);
free_decred(thr_id);
free_deep(thr_id);
+ free_equihash(thr_id);
free_keccak256(thr_id);
free_fresh(thr_id);
free_fugue256(thr_id);
free_groestlcoin(thr_id);
free_heavy(thr_id);
free_hmq17(thr_id);
+ free_hsr(thr_id);
free_jackpot(thr_id);
free_jha(thr_id);
free_lbry(thr_id);
@@ -76,6 +78,7 @@ void algo_free_all(int thr_id)
free_neoscrypt(thr_id);
free_nist5(thr_id);
free_pentablake(thr_id);
+ free_phi(thr_id);
free_quark(thr_id);
free_qubit(thr_id);
free_skeincoin(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index dbab501ac1..a39e78cd0f 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -98,7 +98,7 @@ bool allow_gbt = true;
bool allow_mininginfo = true;
bool check_dups = true; //false;
bool check_stratum_jobs = false;
-
+bool opt_submit_stale = false;
bool submit_old = false;
bool use_syslog = false;
bool use_colors = true;
@@ -263,6 +263,7 @@ Options:\n\
groestl Groestlcoin\n\
heavy Heavycoin\n\
hmq1725 Doubloons / Espers\n\
+ jackpot JHA v8\n\
jha JHA v8 (JackpotCoin)\n\
keccak Keccak-256 (Maxcoin)\n\
lbry LBRY Credits (Sha/Ripemd)\n\
@@ -275,6 +276,7 @@ Options:\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
+ phi BHCoin\n\
quark Quark\n\
qubit Qubit\n\
sha256d SHA256d (bitcoin)\n\
@@ -288,7 +290,7 @@ Options:\n\
skunk Skein Cube Fugue Streebog\n\
s3 S3 (1Coin)\n\
timetravel Machinecoin permuted x8\n\
- tribus Denerius\n\
+ tribus Denarius\n\
vanilla Blake256-8 (VNL)\n\
veltor Thorsriddle streebog\n\
whirlcoin Old Whirlcoin (Whirlpool algo)\n\
@@ -315,7 +317,7 @@ Options:\n\
--cuda-schedule Set device threads scheduling mode (default: auto)\n\
-f, --diff-factor Divide difficulty by this factor (default 1.0) \n\
-m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\
- --vote=VOTE vote (for HeavyCoin)\n\
+ --vote=VOTE vote (for decred and HeavyCoin)\n\
--trust-pool trust the max block reward vote (maxvote) sent by the pool\n\
-o, --url=URL URL of mining server\n\
-O, --userpass=U:P username:password pair for mining server\n\
@@ -331,7 +333,8 @@ Options:\n\
--time-limit maximum time [s] to mine before exiting the program.\n\
-T, --timeout=N network timeout, in seconds (default: 300)\n\
-s, --scantime=N upper bound on time spent scanning current work when\n\
- long polling is unavailable, in seconds (default: 10)\n"
+ long polling is unavailable, in seconds (default: 10)\n\
+ --submit-stale ignore stale jobs checks, may create more rejected shares\n"
#ifndef ORG
"\
--segwit Agree with Segwit (Solo Mining only)\n"
@@ -455,7 +458,8 @@ struct option options[] = {
{ "retries", 1, NULL, 'r' },
{ "retry-pause", 1, NULL, 'R' },
{ "scantime", 1, NULL, 's' },
- { "show-diff", 0, NULL, 1013 },
+ { "show-diff", 0, NULL, 1013 }, // deprecated
+ { "submit-stale", 0, NULL, 1015 },
{ "hide-diff", 0, NULL, 1014 },
{ "statsavg", 1, NULL, 'N' },
{ "gpu-clock", 1, NULL, 1070 },
@@ -927,7 +931,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
/* discard if a newer block was received */
stale_work = work->height && work->height < g_work.height;
- if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) {
+ if (have_stratum && !stale_work && !opt_submit_stale && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) {
pthread_mutex_lock(&g_work_lock);
if (strlen(work->job_id + 8))
stale_work = strncmp(work->job_id + 8, g_work.job_id + 8, sizeof(g_work.job_id) - 8);
@@ -1049,7 +1053,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
applog(LOG_DEBUG, "share diff: %.5f (x %.1f)",
stratum.sharediff, work->shareratio[idnonce]);
- if (opt_vote) { // ALGO_HEAVY
+ if (opt_vote) { // ALGO_HEAVY ALGO_DECRED
nvotestr = bin2hex((const uchar*)(&nvote), 2);
sprintf(s, "{\"method\": \"mining.submit\", \"params\": ["
"\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":%u}",
@@ -2684,7 +2688,9 @@ static void *miner_thread(void *userdata)
case ALGO_HEAVY:
case ALGO_JACKPOT:
case ALGO_JHA:
+ case ALGO_HSR:
case ALGO_LYRA2v2:
+ case ALGO_PHI:
case ALGO_S3:
case ALGO_SKUNK:
case ALGO_TIMETRAVEL:
@@ -2789,6 +2795,8 @@ static void *miner_thread(void *userdata)
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_DECRED:
+ //applog(LOG_BLUE, "version %x, nbits %x, ntime %x extra %x",
+ // work.data[0], work.data[29], work.data[34], work.data[38]);
rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_DEEP:
@@ -2815,6 +2823,9 @@ static void *miner_thread(void *userdata)
case ALGO_HMQ1725:
rc = scanhash_hmq17(thr_id, &work, max_nonce, &hashes_done);
break;
+ case ALGO_HSR:
+ rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done);
+ break;
case ALGO_HEAVY:
rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ);
@@ -2864,6 +2875,9 @@ static void *miner_thread(void *userdata)
case ALGO_PENTABLAKE:
rc = scanhash_pentablake(thr_id, &work, max_nonce, &hashes_done);
break;
+ case ALGO_PHI:
+ rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
+ break;
case ALGO_SCRYPT:
rc = scanhash_scrypt(thr_id, &work, max_nonce, &hashes_done,
NULL, &tv_start, &tv_end);
@@ -4118,6 +4132,9 @@ void parse_arg(int key, char *arg)
case 1014:
opt_showdiff = false;
break;
+ case 1015:
+ opt_submit_stale = true;
+ break;
case 'S':
case 1018:
applog(LOG_INFO, "Now logging to syslog...");
@@ -4186,10 +4203,10 @@ void parse_arg(int key, char *arg)
{
int device_thr[MAX_GPUS] = { 0 };
int ngpus = cuda_num_devices();
- char * pch = strtok (arg,",");
+ char* pch = strtok(arg,",");
opt_n_threads = 0;
while (pch != NULL && opt_n_threads < MAX_GPUS) {
- if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0')
+ if (pch[0] >= '0' && pch[0] <= '9' && strlen(pch) <= 2)
{
if (atoi(pch) < ngpus)
device_map[opt_n_threads++] = atoi(pch);
@@ -4440,7 +4457,7 @@ int main(int argc, char *argv[])
#endif
CUDART_VERSION/1000, (CUDART_VERSION % 1000)/10, arch);
printf(" Originally based on Christian Buchner and Christian H. project\n");
- printf(" Include some algos from alexis78, djm34, sp, tsiv and klausT.\n\n");
+ printf(" Include some kernels from alexis78, djm34, djEzo, tsiv and krnlx.\n\n");
printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n");
}
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 01d1aff5f8..f242c59d6f 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -331,6 +331,7 @@
+
@@ -539,7 +540,11 @@
64
-
+
+
+ compute_50,sm_50;compute_52,sm_52
+
+
@@ -556,8 +561,10 @@
64
+
+
@@ -571,6 +578,8 @@
+
+
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 37372b42ab..87e18010bc 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -109,6 +109,9 @@
{031afae7-2a78-4e32-9738-4b589b6f7ff3}
+
+ {1e548d79-c217-4203-989a-a592fe2b2de3}
+
@@ -222,6 +225,9 @@
Source Files\sph
+
+ Source Files\sph
+
Source Files
@@ -721,6 +727,12 @@
Source Files\CUDA\x13
+
+ Source Files\CUDA\x13
+
+
+ Source Files\CUDA\x13
+
Source Files\CUDA
@@ -763,8 +775,17 @@
Source Files\CUDA\skunk
-
- Source Files\CUDA
+
+ Source Files\CUDA\tribus
+
+
+ Source Files\CUDA\tribus
+
+
+ Source Files\CUDA\tribus
+
+
+ Source Files\CUDA\x11
Source Files\CUDA\x11
@@ -772,6 +793,9 @@
Source Files\CUDA\x11
+
+ Source Files\CUDA\x11
+
Source Files\CUDA\x11
diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h
index d07e736d21..7c280727b7 100644
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@@ -1,188 +1,188 @@
-/* CONFIG ONLY FOR MS VC++ BUILD */
-
-/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
- systems. This function is required for `alloca.c' support on those systems.
- */
-/* #undef CRAY_STACKSEG_END */
-
-/* Define to 1 if using `alloca.c'. */
-/* #undef C_ALLOCA */
-
-/* Define to 1 if you have `alloca', as a function or macro. */
-#define HAVE_ALLOCA 1
-
-/* Define to 1 if you have and it should be used (not on Ultrix).
- */
-#define HAVE_ALLOCA_H 1
-
-/* Define to 1 if you have the declaration of `be32dec', and to 0 if you
- don't. */
-#define HAVE_DECL_BE32DEC 0
-
-/* Define to 1 if you have the declaration of `be32enc', and to 0 if you
- don't. */
-#define HAVE_DECL_BE32ENC 0
-
-/* Define to 1 if you have the declaration of `le32dec', and to 0 if you
- don't. */
-#define HAVE_DECL_LE32DEC 0
-
-/* Define to 1 if you have the declaration of `le32enc', and to 0 if you
- don't. */
-#define HAVE_DECL_LE32ENC 0
-
-/* Define to 1 if you have the `getopt_long' function. */
-#define HAVE_GETOPT_LONG 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_INTTYPES_H 1
-
-/* Define to 1 if you have the `crypto' library (-lcrypto). */
-#define HAVE_LIBCRYPTO 1
-
-/* Define to 1 if you have a functional curl library. */
-#define HAVE_LIBCURL 1
-
-/* Define to 1 if you have the `ssl' library (-lssl). */
-#define HAVE_LIBSSL 1
-
-/* Define to 1 if you have the `z' library (-lz). */
-#define HAVE_LIBZ 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the header file. */
-/* #undef HAVE_SYSLOG_H */
-
-/* Define to 1 if you have the header file. */
-/* #undef HAVE_SYS_ENDIAN_H */
-
-/* Define to 1 if you have the header file. */
-#define HAVE_SYS_PARAM_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_SYS_SYSCTL_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the header file. */
-#define HAVE_UNISTD_H 1
-
-/* Defined if libcurl supports AsynchDNS */
-/* #undef LIBCURL_FEATURE_ASYNCHDNS */
-
-/* Defined if libcurl supports IDN */
-#define LIBCURL_FEATURE_IDN 1
-
-/* Defined if libcurl supports IPv6 */
-#define LIBCURL_FEATURE_IPV6 1
-
-/* Defined if libcurl supports KRB4 */
-/* #undef LIBCURL_FEATURE_KRB4 */
-
-/* Defined if libcurl supports libz */
-#define LIBCURL_FEATURE_LIBZ 1
-
-/* Defined if libcurl supports NTLM */
-#define LIBCURL_FEATURE_NTLM 1
-
-/* Defined if libcurl supports SSL */
-#define LIBCURL_FEATURE_SSL 1
-
-/* Defined if libcurl supports SSPI */
-/* #undef LIBCURL_FEATURE_SSPI */
-
-/* Defined if libcurl supports DICT */
-/* #undef LIBCURL_PROTOCOL_DICT */
-
-/* Defined if libcurl supports FILE */
-#define LIBCURL_PROTOCOL_FILE 1
-
-/* Defined if libcurl supports FTP */
-#define LIBCURL_PROTOCOL_FTP 1
-
-/* Defined if libcurl supports FTPS */
-#define LIBCURL_PROTOCOL_FTPS 1
-
-/* Defined if libcurl supports HTTP */
-#define LIBCURL_PROTOCOL_HTTP 1
-
-/* Defined if libcurl supports HTTPS */
-#define LIBCURL_PROTOCOL_HTTPS 1
-
-/* Defined if libcurl supports IMAP */
-/* #undef LIBCURL_PROTOCOL_IMAP */
-
-/* Defined if libcurl supports LDAP */
-/* #undef LIBCURL_PROTOCOL_LDAP */
-
-/* Defined if libcurl supports POP3 */
-/* #undef LIBCURL_PROTOCOL_POP3 */
-
-/* Defined if libcurl supports RTSP */
-/* #undef LIBCURL_PROTOCOL_RTSP */
-
-/* Defined if libcurl supports SMTP */
-/* #undef LIBCURL_PROTOCOL_SMTP */
-
-/* Defined if libcurl supports TELNET */
-/* #undef LIBCURL_PROTOCOL_TELNET */
-
-/* Defined if libcurl supports TFTP */
-/* #undef LIBCURL_PROTOCOL_TFTP */
-
-/* Define to 1 if your C compiler doesn't accept -c and -o together. */
-/* #undef NO_MINUS_C_MINUS_O */
-
-/* Name of package */
-#define PACKAGE "ccminer"
-
-/* Define to the address where bug reports for this package should be sent. */
-#define PACKAGE_BUGREPORT ""
-
-/* Define to the full name of this package. */
-#define PACKAGE_NAME "ccminer"
-
-/* Define to the home page for this package. */
-#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
-
-/* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2"
-
-/* If using the C implementation of alloca, define if you know the
- direction of stack growth for your system; otherwise it will be
- automatically deduced at runtime.
- STACK_DIRECTION > 0 => grows toward higher addresses
- STACK_DIRECTION < 0 => grows toward lower addresses
- STACK_DIRECTION = 0 => direction of growth unknown */
-/* #undef STACK_DIRECTION */
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
-
-/* Define curl_free() as free() if our version of curl lacks curl_free. */
-/* #undef curl_free */
-
-/* Define to `unsigned int' if does not define. */
-//#define size_t unsigned int
-
-#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900
-#define HAVE_STRUCT_TIMESPEC
-#endif
+/* CONFIG ONLY FOR MS VC++ BUILD */
+
+/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
+ systems. This function is required for `alloca.c' support on those systems.
+ */
+/* #undef CRAY_STACKSEG_END */
+
+/* Define to 1 if using `alloca.c'. */
+/* #undef C_ALLOCA */
+
+/* Define to 1 if you have `alloca', as a function or macro. */
+#define HAVE_ALLOCA 1
+
+/* Define to 1 if you have and it should be used (not on Ultrix).
+ */
+#define HAVE_ALLOCA_H 1
+
+/* Define to 1 if you have the declaration of `be32dec', and to 0 if you
+ don't. */
+#define HAVE_DECL_BE32DEC 0
+
+/* Define to 1 if you have the declaration of `be32enc', and to 0 if you
+ don't. */
+#define HAVE_DECL_BE32ENC 0
+
+/* Define to 1 if you have the declaration of `le32dec', and to 0 if you
+ don't. */
+#define HAVE_DECL_LE32DEC 0
+
+/* Define to 1 if you have the declaration of `le32enc', and to 0 if you
+ don't. */
+#define HAVE_DECL_LE32ENC 0
+
+/* Define to 1 if you have the `getopt_long' function. */
+#define HAVE_GETOPT_LONG 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `crypto' library (-lcrypto). */
+#define HAVE_LIBCRYPTO 1
+
+/* Define to 1 if you have a functional curl library. */
+#define HAVE_LIBCURL 1
+
+/* Define to 1 if you have the `ssl' library (-lssl). */
+#define HAVE_LIBSSL 1
+
+/* Define to 1 if you have the `z' library (-lz). */
+#define HAVE_LIBZ 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the header file. */
+/* #undef HAVE_SYSLOG_H */
+
+/* Define to 1 if you have the header file. */
+/* #undef HAVE_SYS_ENDIAN_H */
+
+/* Define to 1 if you have the header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the header file. */
+#define HAVE_UNISTD_H 1
+
+/* Defined if libcurl supports AsynchDNS */
+/* #undef LIBCURL_FEATURE_ASYNCHDNS */
+
+/* Defined if libcurl supports IDN */
+#define LIBCURL_FEATURE_IDN 1
+
+/* Defined if libcurl supports IPv6 */
+#define LIBCURL_FEATURE_IPV6 1
+
+/* Defined if libcurl supports KRB4 */
+/* #undef LIBCURL_FEATURE_KRB4 */
+
+/* Defined if libcurl supports libz */
+#define LIBCURL_FEATURE_LIBZ 1
+
+/* Defined if libcurl supports NTLM */
+#define LIBCURL_FEATURE_NTLM 1
+
+/* Defined if libcurl supports SSL */
+#define LIBCURL_FEATURE_SSL 1
+
+/* Defined if libcurl supports SSPI */
+/* #undef LIBCURL_FEATURE_SSPI */
+
+/* Defined if libcurl supports DICT */
+/* #undef LIBCURL_PROTOCOL_DICT */
+
+/* Defined if libcurl supports FILE */
+#define LIBCURL_PROTOCOL_FILE 1
+
+/* Defined if libcurl supports FTP */
+#define LIBCURL_PROTOCOL_FTP 1
+
+/* Defined if libcurl supports FTPS */
+#define LIBCURL_PROTOCOL_FTPS 1
+
+/* Defined if libcurl supports HTTP */
+#define LIBCURL_PROTOCOL_HTTP 1
+
+/* Defined if libcurl supports HTTPS */
+#define LIBCURL_PROTOCOL_HTTPS 1
+
+/* Defined if libcurl supports IMAP */
+/* #undef LIBCURL_PROTOCOL_IMAP */
+
+/* Defined if libcurl supports LDAP */
+/* #undef LIBCURL_PROTOCOL_LDAP */
+
+/* Defined if libcurl supports POP3 */
+/* #undef LIBCURL_PROTOCOL_POP3 */
+
+/* Defined if libcurl supports RTSP */
+/* #undef LIBCURL_PROTOCOL_RTSP */
+
+/* Defined if libcurl supports SMTP */
+/* #undef LIBCURL_PROTOCOL_SMTP */
+
+/* Defined if libcurl supports TELNET */
+/* #undef LIBCURL_PROTOCOL_TELNET */
+
+/* Defined if libcurl supports TFTP */
+/* #undef LIBCURL_PROTOCOL_TFTP */
+
+/* Define to 1 if your C compiler doesn't accept -c and -o together. */
+/* #undef NO_MINUS_C_MINUS_O */
+
+/* Name of package */
+#define PACKAGE "ccminer"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT ""
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "ccminer"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.2.2"
+
+/* If using the C implementation of alloca, define if you know the
+ direction of stack growth for your system; otherwise it will be
+ automatically deduced at runtime.
+ STACK_DIRECTION > 0 => grows toward higher addresses
+ STACK_DIRECTION < 0 => grows toward lower addresses
+ STACK_DIRECTION = 0 => direction of growth unknown */
+/* #undef STACK_DIRECTION */
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define curl_free() as free() if our version of curl lacks curl_free. */
+/* #undef curl_free */
+
+/* Define to `unsigned int' if does not define. */
+//#define size_t unsigned int
+
+#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900
+#define HAVE_STRUCT_TIMESPEC
+#endif
diff --git a/configure.ac b/configure.ac
index c369201a86..061dfdd70e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [2.2.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp
index e50b28a96e..82b7845f27 100644
--- a/crypto/xmr-rpc.cpp
+++ b/crypto/xmr-rpc.cpp
@@ -30,6 +30,10 @@
#define MADV_HUGEPAGE 0
#endif
+#ifndef MADV_HUGEPAGE
+#define MADV_HUGEPAGE 0
+#endif
+
#ifndef PRIu64
#define PRIu64 "I64u"
#endif
diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu
index 6fc864d307..b7bcbb5910 100644
--- a/equi/cuda_equi.cu
+++ b/equi/cuda_equi.cu
@@ -2081,7 +2081,7 @@ __host__ void eq_cuda_context::solve(const char *t
// destructor
template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
__host__
-eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+void eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::freemem()
{
if (solutions)
free(solutions);
@@ -2104,6 +2104,12 @@ eq_cuda_context::~eq_cuda_context()
}
}
+template <u32 RB, u32 SM, u32 SSM, u32 THREADS, typename PACKER>
+__host__
+eq_cuda_context<RB, SM, SSM, THREADS, PACKER>::~eq_cuda_context()
+{
+ freemem();
+}
#ifdef CONFIG_MODE_1
template class eq_cuda_context<CONFIG_MODE_1>;
diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp
index 68bdaf0265..7cb10c9630 100644
--- a/equi/eqcuda.hpp
+++ b/equi/eqcuda.hpp
@@ -90,7 +90,7 @@ template struct equi;
struct eq_cuda_context_interface
{
- virtual ~eq_cuda_context_interface();
+ //virtual ~eq_cuda_context_interface();
virtual void solve(const char *tequihash_header,
unsigned int tequihash_header_len,
@@ -99,6 +99,7 @@ struct eq_cuda_context_interface
fn_cancel cancelf,
fn_solution solutionf,
fn_hashdone hashdonef);
+
public:
int thread_id;
int device_id;
@@ -125,9 +126,9 @@ class eq_cuda_context : public eq_cuda_context_interface
fn_cancel cancelf,
fn_solution solutionf,
fn_hashdone hashdonef);
-
public:
eq_cuda_context(int thr_id, int dev_id);
+ void freemem();
~eq_cuda_context();
};
diff --git a/equi/equihash.cpp b/equi/equihash.cpp
index 2a6e5141fe..c9ac1fcf30 100644
--- a/equi/equihash.cpp
+++ b/equi/equihash.cpp
@@ -281,7 +281,11 @@ void free_equihash(int thr_id)
if (!init[thr_id])
return;
- delete(solvers[thr_id]);
+ // assume config 1 was used... interface destructor seems bad
+	eq_cuda_context<CONFIG_MODE_1>* ptr = dynamic_cast<eq_cuda_context<CONFIG_MODE_1>*>(solvers[thr_id]);
+ ptr->freemem();
+ ptr = NULL;
+
solvers[thr_id] = NULL;
init[thr_id] = false;
@@ -291,4 +295,3 @@ void free_equihash(int thr_id)
void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len,
const char* nonce, unsigned int nonce_len,
fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { }
-eq_cuda_context_interface::~eq_cuda_context_interface() { }
diff --git a/miner.h b/miner.h
index 382016a547..be5e34bf87 100644
--- a/miner.h
+++ b/miner.h
@@ -289,6 +289,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen);
+extern int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_jha(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); // quark method
extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -300,6 +301,7 @@ extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, un
extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -350,6 +352,7 @@ extern void free_fugue256(int thr_id);
extern void free_groestlcoin(int thr_id);
extern void free_heavy(int thr_id);
extern void free_hmq17(int thr_id);
+extern void free_hsr(int thr_id);
extern void free_jackpot(int thr_id);
extern void free_jha(int thr_id);
extern void free_lbry(int thr_id);
@@ -361,6 +364,7 @@ extern void free_myriad(int thr_id);
extern void free_neoscrypt(int thr_id);
extern void free_nist5(int thr_id);
extern void free_pentablake(int thr_id);
+extern void free_phi(int thr_id);
extern void free_quark(int thr_id);
extern void free_qubit(int thr_id);
extern void free_sha256d(int thr_id);
@@ -897,6 +901,7 @@ void fresh_hash(void *state, const void *input);
void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void hmq17hash(void *output, const void *input);
+void hsr_hash(void *output, const void *input);
void keccak256_hash(void *state, const void *input);
void jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
@@ -909,6 +914,7 @@ void myriadhash(void *state, const void *input);
void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
+void phihash(void *output, const void *input);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
void scrypthash(void* output, const void* input);
diff --git a/res/ccminer.rc b/res/ccminer.rc
index d7cf1c2f5e..84be50d618 100644
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,2,0,0
- PRODUCTVERSION 2,2,0,0
+ FILEVERSION 2,2,2,0
+ PRODUCTVERSION 2,2,2,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
- VALUE "FileVersion", "2.2"
+ VALUE "FileVersion", "2.2.2"
VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "ProductName", "ccminer"
- VALUE "ProductVersion", "2.2"
+ VALUE "ProductVersion", "2.2.2"
END
END
BLOCK "VarFileInfo"
diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu
index c38de11d46..36ec7923c0 100644
--- a/skunk/cuda_skunk_streebog.cu
+++ b/skunk/cuda_skunk_streebog.cu
@@ -18,7 +18,7 @@
#include
#include
-#include "skunk/streebog_arrays.cuh"
+#include "x11/streebog_arrays.cuh"
//#define FULL_UNROLL
__device__ __forceinline__
@@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
__constant__ uint64_t target64[4];
__host__
-void skunk_set_target(uint32_t* ptarget)
+void skunk_streebog_set_target(uint32_t* ptarget)
{
cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
diff --git a/skunk/skunk.cu b/skunk/skunk.cu
index f89c5fb458..c1add50303 100644
--- a/skunk/skunk.cu
+++ b/skunk/skunk.cu
@@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
// krnlx merged kernel (for high-end cards only)
extern void skunk_cpu_init(int thr_id, uint32_t threads);
-extern void skunk_set_target(uint32_t* ptarget);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
extern void skunk_setBlock_80(int thr_id, void *pdata);
extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
@@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
if (use_compat_kernels[thr_id]) {
skein512_cpu_setBlock_80(endiandata);
- streebog_set_target(ptarget);
+ streebog_sm3_set_target(ptarget);
} else {
skunk_setBlock_80(thr_id, endiandata);
- skunk_set_target(ptarget);
+ skunk_streebog_set_target(ptarget);
}
do {
@@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
- streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+ streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
} else {
skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
diff --git a/tribus/cuda_echo512_aes.cuh b/tribus/cuda_echo512_aes.cuh
new file mode 100644
index 0000000000..ff205aaeb7
--- /dev/null
+++ b/tribus/cuda_echo512_aes.cuh
@@ -0,0 +1,318 @@
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#define __CUDA_ARCH__ 520
+#include <cuda_helper.h>
+#endif
+
+#undef ROL8
+#undef ROR8
+#undef ROL16
+
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__
+uint32_t ROL8(const uint32_t a) {
+ return __byte_perm(a, 0, 0x2103);
+}
+__device__ __forceinline__
+uint32_t ROR8(const uint32_t a) {
+ return __byte_perm(a, 0, 0x0321);
+}
+__device__ __forceinline__
+uint32_t ROL16(const uint32_t a) {
+ return __byte_perm(a, 0, 0x1032);
+}
+#else
+#define ROL8(u) ROTL32(u, 8)
+#define ROR8(u) ROTR32(u, 8)
+#define ROL16(u) ROTL32(u,16)
+#endif
+
+__device__ uint32_t d_AES0[256] = {
+ 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591,
+ 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
+ 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB,
+ 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B,
+ 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83,
+ 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A,
+ 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F,
+ 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA,
+ 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B,
+ 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413,
+ 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6,
+ 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85,
+ 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511,
+ 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B,
+ 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1,
+ 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF,
+ 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E,
+ 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6,
+ 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B,
+ 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD,
+ 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8,
+ 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2,
+ 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949,
+ 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810,
+ 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697,
+ 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F,
+ 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C,
+ 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27,
+ 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433,
+ 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5,
+ 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0,
+ 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C
+};
+
+__device__ uint32_t d_AES3[256] = {
+ 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5,
+ 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676,
+ 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0,
+ 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0,
+ 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC,
+ 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515,
+ 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A,
+ 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575,
+ 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0,
+ 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484,
+ 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B,
+ 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF,
+ 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585,
+ 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8,
+ 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5,
+ 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2,
+ 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717,
+ 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373,
+ 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888,
+ 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB,
+ 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C,
+ 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979,
+ 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9,
+ 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808,
+ 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6,
+ 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A,
+ 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E,
+ 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E,
+ 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494,
+ 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF,
+ 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868,
+ 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616
+};
+
+__device__ __forceinline__
+void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256])
+{
+ /* each thread startup will fill a uint32 */
+ if (threadIdx.x < 256) {
+ uint32_t temp = __ldg(&d_AES0[threadIdx.x]);
+ sharedMemory[0][threadIdx.x] = temp;
+ sharedMemory[1][threadIdx.x] = ROL8(temp);
+ sharedMemory[2][threadIdx.x] = ROL16(temp);
+ sharedMemory[3][threadIdx.x] = ROR8(temp);
+ }
+}
+
+__device__ __forceinline__
+void aes_gpu_init256(uint32_t sharedMemory[4][256])
+{
+ /* each thread startup will fill a uint32 */
+ uint32_t temp = __ldg(&d_AES0[threadIdx.x]);
+ sharedMemory[0][threadIdx.x] = temp;
+ sharedMemory[1][threadIdx.x] = ROL8(temp);
+ sharedMemory[2][threadIdx.x] = ROL16(temp);
+ sharedMemory[3][threadIdx.x] = ROR8(temp);
+}
+
+__device__ __forceinline__
+void aes_gpu_init128(uint32_t sharedMemory[4][256])
+{
+ /* each thread startup will fill 2 uint32 */
+ uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
+
+ sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
+ sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
+ sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+ sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+ sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+ sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+ sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+ sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+}
+
+__device__ __forceinline__
+void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256])
+{
+ if (threadIdx.x < 128) {
+ /* each thread startup will fill 2 uint32 */
+ uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
+
+ sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
+ sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
+ sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+ sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+ sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+ sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+ sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+ sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+ }
+}
+
+__device__ __forceinline__
+static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
+ const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+ y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]);
+ y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+ y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+ y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+
+ y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
+ y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+ y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+#ifdef INTENSIVE_GMF
+ y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+#else
+ y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)];
+#endif
+
+ y0 ^= k0;
+
+ y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]);
+ y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+ y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+ y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+
+ y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
+ y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+ y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+ y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+}
+
+__device__ __forceinline__
+static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
+ const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+ y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]);
+ y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+ y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+ y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+
+ y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
+ y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+ y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+ y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+
+ y0 ^= k0;
+
+ y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]);
+ y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+ y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+ y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+
+ y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]);
+ y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+ y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+ y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+}
+
+__device__ __forceinline__
+static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
+ uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+ y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)];
+ y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+ y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+ y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+
+#ifdef INTENSIVE_GMF
+ y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
+#else
+ y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
+#endif
+ y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+ y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+ y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+
+ y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)];
+ y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+ y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+ y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+
+ y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
+ y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+ y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+ y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+}
+
+__device__ __forceinline__
+static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3,
+ uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+ y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)];
+ y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+ y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+ y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+
+ y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
+ y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+ y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+ y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+
+ y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)];
+ y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+ y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+ y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+
+ y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
+ y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+ y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+ y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+}
+
+__device__ __forceinline__
+static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+ uint32_t y0, y1, y2, y3;
+
+ aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+
+ aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+ // hier werden wir ein carry brauchen (oder auch nicht)
+ k0++;
+}
+
+__device__ __forceinline__
+static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+ uint32_t y0, y1, y2, y3;
+
+ aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+
+ aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+ // hier werden wir ein carry brauchen (oder auch nicht)
+ k0++;
+}
+
+__device__ __forceinline__
+static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x)
+{
+ uint32_t y0, y1, y2, y3;
+ aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3);
+
+ x->x = y0;
+ x->y = y1;
+ x->z = y2;
+ x->w = y3;
+}
+
+__device__ __forceinline__
+static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k)
+{
+ uint32_t y0, y1, y2, y3;
+ aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3);
+
+ k[0] = y1;
+ k[1] = y2;
+ k[2] = y3;
+ k[3] = y0;
+}
diff --git a/tribus/cuda_echo512_final.cu b/tribus/cuda_echo512_final.cu
new file mode 100644
index 0000000000..b68a9c6ea5
--- /dev/null
+++ b/tribus/cuda_echo512_final.cu
@@ -0,0 +1,285 @@
+/**
+ * Based on Provos Alexis work - 2016 FOR SM 5+
+ *
+ * final touch by tpruvot for tribus - 09 2017
+ */
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+#include <cuda_vector_uint2x4.h>
+
+#define INTENSIVE_GMF
+#include "tribus/cuda_echo512_aes.cuh"
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#define atomicExch(p,y) (*p) = y
+#endif
+
+__device__
+static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
+{
+ // Big Sub Words
+ #pragma unroll 16
+ for (int idx = 0; idx < 16; idx++)
+ AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
+
+ // Shift Rows
+ #pragma unroll 4
+ for (int i = 0; i < 4; i++)
+ {
+ uint32_t t[4];
+ /// 1, 5, 9, 13
+ t[0] = W[i + 4];
+ t[1] = W[i + 8];
+ t[2] = W[i + 24];
+ t[3] = W[i + 60];
+
+ W[i + 4] = W[i + 20];
+ W[i + 8] = W[i + 40];
+ W[i + 24] = W[i + 56];
+ W[i + 60] = W[i + 44];
+
+ W[i + 20] = W[i + 36];
+ W[i + 40] = t[1];
+ W[i + 56] = t[2];
+ W[i + 44] = W[i + 28];
+
+ W[i + 28] = W[i + 12];
+ W[i + 12] = t[3];
+ W[i + 36] = W[i + 52];
+ W[i + 52] = t[0];
+ }
+ // Mix Columns
+ #pragma unroll 4
+ for (int i = 0; i < 4; i++)
+ {
+ #pragma unroll 4
+ for (int idx = 0; idx < 64; idx += 16)
+ {
+ uint32_t a[4];
+ a[0] = W[idx + i];
+ a[1] = W[idx + i + 4];
+ a[2] = W[idx + i + 8];
+ a[3] = W[idx + i +12];
+
+ uint32_t ab = a[0] ^ a[1];
+ uint32_t bc = a[1] ^ a[2];
+ uint32_t cd = a[2] ^ a[3];
+
+ uint32_t t, t2, t3;
+ t = (ab & 0x80808080);
+ t2 = (bc & 0x80808080);
+ t3 = (cd & 0x80808080);
+
+ uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+ uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+ uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+ W[idx + i] = bc ^ a[3] ^ abx;
+ W[idx + i + 4] = a[0] ^ cd ^ bcx;
+ W[idx + i + 8] = ab ^ a[3] ^ cdx;
+ W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
+ }
+ }
+}
+
+__global__ __launch_bounds__(256, 3) /* will force 80 registers */
+static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target)
+{
+ __shared__ uint32_t sharedMemory[4][256];
+
+ aes_gpu_init256(sharedMemory);
+
+ const uint32_t P[48] = {
+ 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+ //8-12
+ 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+ //21-25
+ 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+ //34-38
+ 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+ 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
+ 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
+ //58-61
+ };
+ uint32_t k0;
+ uint32_t h[16];
+
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t *hash = (uint32_t*)&g_hash[thread<<3];
+
+ *(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]);
+ *(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]);
+
+ uint64_t backup = *(uint64_t*)&h[6];
+
+ k0 = 512 + 8;
+
+ #pragma unroll 4
+ for (uint32_t idx = 0; idx < 16; idx += 4)
+ AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
+
+ k0 += 4;
+
+ uint32_t W[64];
+
+ #pragma unroll 4
+ for (uint32_t i = 0; i < 4; i++)
+ {
+ uint32_t a = P[i];
+ uint32_t b = P[i + 4];
+ uint32_t c = h[i + 8];
+ uint32_t d = P[i + 8];
+
+ uint32_t ab = a ^ b;
+ uint32_t bc = b ^ c;
+ uint32_t cd = c ^ d;
+
+ uint32_t t = ((a ^ b) & 0x80808080);
+ uint32_t t2 = ((b ^ c) & 0x80808080);
+ uint32_t t3 = ((c ^ d) & 0x80808080);
+
+ uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1);
+ uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1);
+ uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1);
+
+ W[0 + i] = bc ^ d ^ abx;
+ W[4 + i] = a ^ cd ^ bcx;
+ W[8 + i] = ab ^ d ^ cdx;
+ W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+ a = P[12 + i];
+ b = h[i + 4];
+ c = P[12 + i + 4];
+ d = P[12 + i + 8];
+
+ ab = a ^ b;
+ bc = b ^ c;
+ cd = c ^ d;
+
+ t = (ab & 0x80808080);
+ t2 = (bc & 0x80808080);
+ t3 = (cd & 0x80808080);
+
+ abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+ bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+ cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+ W[16 + i] = abx ^ bc ^ d;
+ W[16 + i + 4] = bcx ^ a ^ cd;
+ W[16 + i + 8] = cdx ^ ab ^ d;
+ W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+ a = h[i];
+ b = P[24 + i];
+ c = P[24 + i + 4];
+ d = P[24 + i + 8];
+
+ ab = a ^ b;
+ bc = b ^ c;
+ cd = c ^ d;
+
+ t = (ab & 0x80808080);
+ t2 = (bc & 0x80808080);
+ t3 = (cd & 0x80808080);
+
+ abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+ bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+ cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+ W[32 + i] = abx ^ bc ^ d;
+ W[32 + i + 4] = bcx ^ a ^ cd;
+ W[32 + i + 8] = cdx ^ ab ^ d;
+ W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+ a = P[36 + i ];
+ b = P[36 + i + 4];
+ c = P[36 + i + 8];
+ d = h[i + 12];
+
+ ab = a ^ b;
+ bc = b ^ c;
+ cd = c ^ d;
+
+ t = (ab & 0x80808080);
+ t2 = (bc & 0x80808080);
+ t3 = (cd & 0x80808080);
+
+ abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+ bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+ cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+ W[48 + i] = abx ^ bc ^ d;
+ W[48 + i + 4] = bcx ^ a ^ cd;
+ W[48 + i + 8] = cdx ^ ab ^ d;
+ W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
+ }
+
+ for (int k = 1; k < 9; k++)
+ echo_round(sharedMemory,W,k0);
+
+ // Big Sub Words
+ uint32_t y0, y1, y2, y3;
+// AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0);
+ aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]);
+
+ aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]);
+ aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]);
+
+ aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]);
+ aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]);
+
+ aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]);
+ aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]);
+
+ aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]);
+ aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3);
+ aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]);
+
+ uint32_t bc = W[22] ^ W[42];
+ uint32_t t2 = (bc & 0x80808080);
+ W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+
+ bc = W[23] ^ W[43];
+ t2 = (bc & 0x80808080);
+ W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+
+ bc = W[10] ^ W[54];
+ t2 = (bc & 0x80808080);
+ W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+
+ bc = W[11] ^ W[55];
+ t2 = (bc & 0x80808080);
+ W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+
+ uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30]
+ ^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62];
+
+ if(check <= target){
+ uint32_t tmp = atomicExch(&resNonce[0], thread);
+ if (tmp != UINT32_MAX)
+ resNonce[1] = tmp;
+ }
+ }
+}
+
+__host__
+void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
+{
+ const uint32_t threadsperblock = 256;
+
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+	tribus_echo512_gpu_final <<<grid, block>>> (threads, (uint64_t*)d_hash, d_resNonce, target);
+}
diff --git a/tribus.cu b/tribus/tribus.cu
similarity index 71%
rename from tribus.cu
rename to tribus/tribus.cu
index ed82850085..4516e7d69c 100644
--- a/tribus.cu
+++ b/tribus/tribus.cu
@@ -1,7 +1,7 @@
/**
* Tribus Algo for Denarius
*
- * tpruvot@github 06 2017 - GPLv3
+ * tpruvot@github 09 2017 - GPLv3
*
*/
extern "C" {
@@ -16,9 +16,10 @@ extern "C" {
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
+void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
static uint32_t *d_hash[MAX_GPUS];
-
+static uint32_t *d_resNonce[MAX_GPUS];
// cpu hash
@@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
if (!init[thr_id])
{
- cudaSetDevice(device_map[thr_id]);
+ int dev_id = device_map[thr_id];
+ cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
- x11_echo512_cpu_init(thr_id, throughput);
+
+ cuda_get_arch(thr_id);
+ use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+ if (use_compat_kernels[thr_id])
+ x11_echo512_cpu_init(thr_id, throughput);
// char[64] work space for hashes results
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput));
+ CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
@@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
be32enc(&endiandata[k], pdata[k]);
jh512_setBlock_80(thr_id, endiandata);
- cuda_check_cpu_setTarget(ptarget);
+ if (use_compat_kernels[thr_id])
+ cuda_check_cpu_setTarget(ptarget);
+ else
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
work->valid_nonces = 0;
do {
int order = 1;
-
- // Hash with CUDA
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
- x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+ if (use_compat_kernels[thr_id]) {
+ x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+ work->nonces[1] = UINT32_MAX;
+ } else {
+ tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
+ cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ }
*hashes_done = pdata[19] - first_nonce + throughput;
- work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
- const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
+ const uint32_t Htarg = ptarget[7];
+ const uint32_t startNounce = pdata[19];
+ if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce;
be32enc(&endiandata[19], work->nonces[0]);
tribus_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
- work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
- if (work->nonces[1] != 0) {
+ if (work->nonces[1] != UINT32_MAX) {
+ work->nonces[1] += startNounce;
be32enc(&endiandata[19], work->nonces[1]);
tribus_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
@@ -127,7 +145,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
- gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
pdata[19] = work->nonces[0] + 1;
continue;
}
@@ -144,7 +163,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce
out:
// *hashes_done = pdata[19] - first_nonce;
-
return work->valid_nonces;
}
@@ -157,8 +175,8 @@ extern "C" void free_tribus(int thr_id)
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
+ cudaFree(d_resNonce[thr_id]);
- quark_groestl512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
diff --git a/util.cpp b/util.cpp
index 2c6a1e9469..5b87d9652a 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2211,6 +2211,9 @@ void print_hash_tests(void)
hmq17hash(&hash[0], &buf[0]);
printpfx("hmq1725", hash);
+ hsr_hash(&hash[0], &buf[0]);
+ printpfx("hsr", hash);
+
jha_hash(&hash[0], &buf[0]);
printpfx("jha", hash);
@@ -2245,6 +2248,9 @@ void print_hash_tests(void)
pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash);
+ phihash(&hash[0], &buf[0]);
+ printpfx("phi", hash);
+
quarkhash(&hash[0], &buf[0]);
printpfx("quark", hash);
diff --git a/x11/c11.cu b/x11/c11.cu
index 5dee17455a..8f8f6663b2 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -18,10 +18,13 @@ extern "C"
#include "cuda_helper.h"
#include "cuda_x11.h"
+void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
+
#include <stdio.h>
#include <memory.h>
static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_resNonce[MAX_GPUS];
// Flax/Chaincoin C11 CPU Hash
extern "C" void c11hash(void *output, const void *input)
@@ -103,6 +106,7 @@ extern "C" void c11hash(void *output, const void *input)
#endif
static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -118,7 +122,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id])
{
- cudaSetDevice(device_map[thr_id]);
+ int dev_id = device_map[thr_id];
+ cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -127,6 +132,9 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+ cuda_get_arch(thr_id);
+ use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
@@ -135,11 +143,13 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
quark_jh512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
- x11_echo512_cpu_init(thr_id, throughput);
+ if (use_compat_kernels[thr_id])
+ x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
- CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0);
+ CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
cuda_check_cpu_init(thr_id, throughput);
@@ -151,7 +161,10 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
be32enc(&endiandata[k], pdata[k]);
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
- cuda_check_cpu_setTarget(ptarget);
+ if (use_compat_kernels[thr_id])
+ cuda_check_cpu_setTarget(ptarget);
+ else
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
do {
int order = 0;
@@ -175,24 +188,32 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
TRACE("shavite:");
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
- x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
- TRACE("echo => ");
+
+ if (use_compat_kernels[thr_id]) {
+ x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+ work->nonces[1] = UINT32_MAX;
+ } else {
+ tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
+ cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ }
*hashes_done = pdata[19] - first_nonce + throughput;
- work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
- const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
+ const uint32_t Htarg = ptarget[7];
+ const uint32_t startNounce = pdata[19];
+ if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce;
be32enc(&endiandata[19], work->nonces[0]);
c11hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
- work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
- if (work->nonces[1] != 0) {
+ if (work->nonces[1] != UINT32_MAX) {
+ work->nonces[1] += startNounce;
be32enc(&endiandata[19], work->nonces[1]);
c11hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
@@ -206,7 +227,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
- gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
pdata[19] = work->nonces[0] + 1;
continue;
}
@@ -234,6 +256,8 @@ extern "C" void free_c11(int thr_id)
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
+ cudaFree(d_resNonce[thr_id]);
+
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu
index 228c691312..d6e3685e62 100644
--- a/x11/cuda_streebog.cu
+++ b/x11/cuda_streebog.cu
@@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
#define T6(x) shared[6][x]
#define T7(x) shared[7][x]
+// Streebog final for Veltor and skunk on SM 3.x
__constant__ uint64_t target64[4];
__host__
-void streebog_set_target(const uint32_t* ptarget)
+void streebog_sm3_set_target(uint32_t* ptarget)
{
cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice);
}
@@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce)
}
__host__
-void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
+void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu
new file mode 100644
index 0000000000..6a06332933
--- /dev/null
+++ b/x11/cuda_streebog_maxwell.cu
@@ -0,0 +1,309 @@
+/*
+ * Streebog GOST R 34.10-2012 CUDA implementation.
+ *
+ * https://tools.ietf.org/html/rfc6986
+ * https://en.wikipedia.org/wiki/Streebog
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * @author Tanguy Pruvot - 2015
+ * @author Alexis Provos - 2016
+ */
+
+// Further improved with shared memory partial utilization
+// Tested under CUDA7.5 toolkit for cp 5.0/5.2
+
+//#include <stdio.h>
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+#include <cuda_vector_uint2x4.h>
+
+#include "streebog_arrays.cuh"
+
+//#define FULL_UNROLL
+__device__ __forceinline__
+static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
+{
+ return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
+ ^ shared[1][__byte_perm(state[6].x,0,0x44440)]
+ ^ shared[2][__byte_perm(state[5].x,0,0x44440)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44440)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44440)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44440)]
+ ^ shared[6][__byte_perm(state[1].x,0,0x44440)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);
+
+ return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44441)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44441)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44441)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44441)]
+ ^ shared[6][__byte_perm(state[1].x,0,0x44441)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]);
+
+ return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44442)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44442)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44442)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44442)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)])
+ ^ shared[6][__byte_perm(state[1].x,0,0x44442)];
+
+ return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
+ ^ shared[1][__byte_perm(state[6].x,0,0x44443)]
+ ^ shared[2][__byte_perm(state[5].x,0,0x44443)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44443)]
+ ^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)])
+ ^ shared[5][__byte_perm(state[2].x,0,0x44443)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)])
+ ^ shared[6][__byte_perm(state[1].x,0,0x44443)];
+
+ return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
+ ^ shared[1][__byte_perm(state[6].y,0,0x44440)]
+ ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
+ ^ shared[3][__byte_perm(state[4].y,0,0x44440)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44440)]
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)])
+ ^ shared[5][__byte_perm(state[2].y,0,0x44440)]
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]);
+
+ return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
+ ^ shared[2][__byte_perm(state[5].y,0,0x44441)]
+ ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
+ ^ shared[3][__byte_perm(state[4].y,0,0x44441)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44441)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44441)]
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)])
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]);
+
+ return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
+ ^ shared[1][__byte_perm(state[6].y,0,0x44442)]
+ ^ shared[2][__byte_perm(state[5].y,0,0x44442)]
+ ^ shared[3][__byte_perm(state[4].y,0,0x44442)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44442)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44442)]
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)])
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]);
+
+ return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
+ ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)])
+ ^ shared[2][__byte_perm(state[5].y,0,0x44443)]
+ ^ shared[3][__byte_perm(state[4].y,0,0x44443)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44443)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44443)]
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)])
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]);
+}
+
+__device__ __forceinline__
+static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
+{
+ return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44440)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44440)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44440)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44440)]
+ ^ shared[6][__byte_perm(state[1].x,0,0x44440)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);
+
+ return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44441)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44441)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44441)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44441)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)])
+ ^ shared[6][__byte_perm(state[1].x,0,0x44441)];
+
+ return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44442)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44442)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44442)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44442)]
+ ^ shared[6][__byte_perm(state[1].x,0,0x44442)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]);
+
+ return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
+ ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)])
+ ^ shared[2][__byte_perm(state[5].x,0,0x44443)]
+ ^ shared[3][__byte_perm(state[4].x,0,0x44443)]
+ ^ shared[4][__byte_perm(state[3].x,0,0x44443)]
+ ^ shared[5][__byte_perm(state[2].x,0,0x44443)]
+ ^ shared[6][__byte_perm(state[1].x,0,0x44443)]
+ ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]);
+
+ return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
+ ^ shared[1][__byte_perm(state[6].y,0,0x44440)]
+ ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
+ ^ shared[3][__byte_perm(state[4].y,0,0x44440)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44440)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44440)]
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)])
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]);
+
+ return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
+ ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
+ ^ shared[2][__byte_perm(state[5].y,0,0x44441)]
+ ^ shared[3][__byte_perm(state[4].y,0,0x44441)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44441)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44441)]
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)])
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]);
+
+ return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
+ ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)])
+ ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)])
+ ^ shared[3][__byte_perm(state[4].y,0,0x44442)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44442)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44442)]
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)])
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]);
+
+ return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
+ ^ shared[1][__byte_perm(state[6].y,0,0x44443)]
+ ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)])
+ ^ shared[3][__byte_perm(state[4].y,0,0x44443)]
+ ^ shared[4][__byte_perm(state[3].y,0,0x44443)]
+ ^ shared[5][__byte_perm(state[2].y,0,0x44443)]
+ ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)])
+ ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]);
+}
+
+__device__ __forceinline__
+static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state)
+{
+ uint2 t[8];
+ for(int i=0; i<12; i++){
+ GOST_FS(shared,state, t);
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ K[ j] ^= *(uint2*)&CC[i][j];
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ state[ j] = t[ j];
+
+ GOST_FS_LDG(shared,K, t);
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ state[ j]^= t[ j];
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ K[ j] = t[ j];
+ }
+}
+
+#define TPB 256
+__global__
+#if __CUDA_ARCH__ > 500
+__launch_bounds__(TPB, 3)
+#else
+__launch_bounds__(TPB, 3)
+#endif
+void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ uint2 buf[8], t[8], temp[8], K0[8], hash[8];
+
+ __shared__ uint2 shared[8][256];
+ shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]);
+ shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]);
+ shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]);
+ shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]);
+ shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]);
+ shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]);
+ shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
+ shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
+
+ uint64_t* inout = &g_hash[thread<<3];
+
+ *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
+ *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
+
+ __threadfence_block();
+
+ K0[0] = vectorize(0x74a5d4ce2efc83b3);
+
+ #pragma unroll 8
+ for(int i=0;i<8;i++){
+ buf[ i] = K0[ 0] ^ hash[ i];
+ }
+
+ for(int i=0; i<12; i++){
+ GOST_FS(shared, buf, temp);
+ #pragma unroll 8
+ for(uint32_t j=0;j<8;j++){
+ buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j];
+ }
+ }
+ #pragma unroll 8
+ for(int j=0;j<8;j++){
+ buf[ j]^= hash[ j];
+ }
+ #pragma unroll 8
+ for(int j=0;j<8;j++){
+ K0[ j] = buf[ j];
+ }
+
+ K0[7].y ^= 0x00020000;
+
+ GOST_FS(shared, K0, t);
+
+ #pragma unroll 8
+ for(int i=0;i<8;i++)
+ K0[ i] = t[ i];
+
+ t[7].y ^= 0x01000000;
+
+ GOST_E12(shared, K0, t);
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ buf[ j] ^= t[ j];
+
+ buf[7].y ^= 0x01000000;
+
+ GOST_FS(shared, buf,K0);
+
+ buf[7].y ^= 0x00020000;
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ t[ j] = K0[ j];
+
+ t[7].y ^= 0x00020000;
+
+ GOST_E12(shared, K0, t);
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ buf[ j] ^= t[ j];
+
+ GOST_FS(shared, buf,K0); // K = F(h)
+
+ hash[7]+= vectorize(0x0100000000000000);
+
+ #pragma unroll 8
+ for(int j=0;j<8;j++)
+ t[ j] = K0[ j] ^ hash[ j];
+
+ GOST_E12(shared, K0, t);
+
+ *(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0];
+ *(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4];
+}
+
+__host__
+void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
+ dim3 grid((threads + TPB-1) / TPB);
+ dim3 block(TPB);
+	streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
+}
diff --git a/x11/phi.cu b/x11/phi.cu
new file mode 100644
index 0000000000..ab1f30833c
--- /dev/null
+++ b/x11/phi.cu
@@ -0,0 +1,223 @@
+//
+//
+// PHI1612 algo
+// Skein + JH + CubeHash + Fugue + Gost + Echo
+//
+// Implemented by anorganix @ bitcointalk on 01.10.2017
+// Feel free to send some satoshis to 1Bitcoin8tfbtGAQNFxDRUVUfFgFWKoWi9
+//
+//
+
+extern "C" {
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_streebog.h"
+#include "sph/sph_echo.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x11.h"
+
+extern void skein512_cpu_setBlock_80(void *pdata);
+extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
+extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target);
+
+#include <stdio.h>
+#include <memory.h>
+
+static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_resNonce[MAX_GPUS];
+
+extern "C" void phihash(void *output, const void *input)
+{
+ unsigned char _ALIGN(128) hash[128] = { 0 };
+
+ sph_skein512_context ctx_skein;
+ sph_jh512_context ctx_jh;
+ sph_cubehash512_context ctx_cubehash;
+ sph_fugue512_context ctx_fugue;
+ sph_gost512_context ctx_gost;
+ sph_echo512_context ctx_echo;
+
+ sph_skein512_init(&ctx_skein);
+ sph_skein512(&ctx_skein, input, 80);
+ sph_skein512_close(&ctx_skein, (void*)hash);
+
+ sph_jh512_init(&ctx_jh);
+ sph_jh512(&ctx_jh, (const void*)hash, 64);
+ sph_jh512_close(&ctx_jh, (void*)hash);
+
+ sph_cubehash512_init(&ctx_cubehash);
+ sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
+ sph_cubehash512_close(&ctx_cubehash, (void*)hash);
+
+ sph_fugue512_init(&ctx_fugue);
+ sph_fugue512(&ctx_fugue, (const void*)hash, 64);
+ sph_fugue512_close(&ctx_fugue, (void*)hash);
+
+ sph_gost512_init(&ctx_gost);
+ sph_gost512(&ctx_gost, (const void*)hash, 64);
+ sph_gost512_close(&ctx_gost, (void*)hash);
+
+ sph_echo512_init(&ctx_echo);
+ sph_echo512(&ctx_echo, (const void*)hash, 64);
+ sph_echo512_close(&ctx_echo, (void*)hash);
+
+ memcpy(output, hash, 32);
+}
+
+#define _DEBUG_PREFIX "phi"
+#include "cuda_debug.cuh"
+
+static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+ uint32_t *pdata = work->data;
+ uint32_t *ptarget = work->target;
+
+ const uint32_t first_nonce = pdata[19];
+ const int dev_id = device_map[thr_id];
+
+ int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 19 : 18; // 2^18 = 262144 cuda threads
+ if (device_sm[dev_id] >= 600) intensity = 20;
+
+ uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+ if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+ if (opt_benchmark)
+ ptarget[7] = 0xf;
+
+ if (!init[thr_id])
+ {
+ cudaSetDevice(dev_id);
+ if (opt_cudaschedule == -1 && gpu_threads == 1) {
+ cudaDeviceReset();
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+
+ CUDA_LOG_ERROR();
+ }
+ gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+ cuda_get_arch(thr_id);
+ use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
+ quark_skein512_cpu_init(thr_id, throughput);
+ quark_jh512_cpu_init(thr_id, throughput);
+ x11_cubehash512_cpu_init(thr_id, throughput);
+ x13_fugue512_cpu_init(thr_id, throughput);
+ if (use_compat_kernels[thr_id])
+ x11_echo512_cpu_init(thr_id, throughput);
+
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1);
+ CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t)));
+
+ cuda_check_cpu_init(thr_id, throughput);
+ init[thr_id] = true;
+ }
+
+ uint32_t endiandata[20];
+
+ for (int k = 0; k < 20; k++)
+ be32enc(&endiandata[k], pdata[k]);
+
+ skein512_cpu_setBlock_80((void*)endiandata);
+ if (use_compat_kernels[thr_id])
+ cuda_check_cpu_setTarget(ptarget);
+ else
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
+
+ do {
+ int order = 0;
+
+ skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+ quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ if (use_compat_kernels[thr_id]) {
+ streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+ x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+ } else {
+ streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]);
+ tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6]));
+ cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+ }
+
+ if (work->nonces[0] != UINT32_MAX)
+ {
+ const uint32_t Htarg = ptarget[7];
+ const uint32_t startNonce = pdata[19];
+ uint32_t _ALIGN(64) vhash[8];
+ if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
+ be32enc(&endiandata[19], work->nonces[0]);
+ phihash(vhash, endiandata);
+
+ if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+ work->valid_nonces = 1;
+ work_set_target_ratio(work, vhash);
+ *hashes_done = pdata[19] - first_nonce + throughput;
+ //work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+ //if (work->nonces[1] != 0) {
+ if (work->nonces[1] != UINT32_MAX) {
+ work->nonces[1] += startNonce;
+ be32enc(&endiandata[19], work->nonces[1]);
+ phihash(vhash, endiandata);
+ bn_set_target_ratio(work, vhash, 1);
+ work->valid_nonces++;
+ pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+ }
+ else {
+ pdata[19] = work->nonces[0] + 1; // cursor
+ }
+ return work->valid_nonces;
+ }
+ else if (vhash[7] > Htarg) {
+ gpu_increment_reject(thr_id);
+ if (!opt_quiet)
+ gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t));
+ pdata[19] = work->nonces[0] + 1;
+ continue;
+ }
+ }
+
+ if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+ pdata[19] = max_nonce;
+ break;
+ }
+ pdata[19] += throughput;
+
+ } while (!work_restart[thr_id].restart);
+
+ *hashes_done = pdata[19] - first_nonce;
+ return 0;
+}
+
+// cleanup
+extern "C" void free_phi(int thr_id)
+{
+ if (!init[thr_id])
+ return;
+
+ cudaThreadSynchronize();
+ cudaFree(d_hash[thr_id]);
+ cudaFree(d_resNonce[thr_id]);
+ x13_fugue512_cpu_free(thr_id);
+
+ cuda_check_cpu_free(thr_id);
+ init[thr_id] = false;
+
+ cudaDeviceSynchronize();
+}
diff --git a/x11/sib.cu b/x11/sib.cu
index 158f85e287..c437523d03 100644
--- a/x11/sib.cu
+++ b/x11/sib.cu
@@ -18,6 +18,7 @@ extern "C" {
#include "cuda_x11.h"
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash);
#include <stdio.h>
#include <memory.h>
@@ -98,6 +99,7 @@ extern "C" void sibhash(void *output, const void *input)
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -124,6 +126,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+ cuda_get_arch(thr_id);
+ use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
@@ -166,7 +171,10 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
TRACE("jh512 :");
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
- streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+ if (use_compat_kernels[thr_id])
+ streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
+ else
+ streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]);
TRACE("gost :");
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
TRACE("luffa+c:");
diff --git a/skunk/streebog_arrays.cuh b/x11/streebog_arrays.cuh
similarity index 100%
rename from skunk/streebog_arrays.cuh
rename to x11/streebog_arrays.cuh
diff --git a/x11/veltor.cu b/x11/veltor.cu
index be05f5a7d5..7bc1e18dab 100644
--- a/x11/veltor.cu
+++ b/x11/veltor.cu
@@ -10,11 +10,17 @@ extern "C" {
#include "cuda_x11.h"
extern void skein512_cpu_setBlock_80(void *pdata);
-extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
-extern void streebog_set_target(const uint32_t* ptarget);
+
+// for SM3.x
+extern void streebog_sm3_set_target(uint32_t* ptarget);
+extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
+
+// for latest cards only
+extern void skunk_cpu_init(int thr_id, uint32_t threads);
+extern void skunk_streebog_set_target(uint32_t* ptarget);
+extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce);
#include <stdio.h>
#include <memory.h>
@@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget);
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
-// veltorcoin CPU Hash
+// veltor CPU Hash
extern "C" void veltorhash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[128] = { 0 };
@@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
@@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
- quark_skein512_cpu_init(thr_id, throughput);
+ skunk_cpu_init(thr_id, throughput);
+ use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+
x11_shavite512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
@@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
skein512_cpu_setBlock_80(endiandata);
cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
- streebog_set_target(ptarget);
+ if(use_compat_kernels[thr_id])
+ streebog_sm3_set_target(ptarget);
+ else
+ skunk_streebog_set_target(ptarget);
do {
int order = 0;
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
- streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+ if(use_compat_kernels[thr_id])
+ streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
+ else
+ skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
diff --git a/x13/cuda_hsr_sm3.cu b/x13/cuda_hsr_sm3.cu
new file mode 100644
index 0000000000..5ce018626e
--- /dev/null
+++ b/x13/cuda_hsr_sm3.cu
@@ -0,0 +1,139 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+#define F(x, y, z) (((x) ^ (y) ^ (z)))
+#define FF(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z)))
+#define GG(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+
+#define P0(x) x ^ ROTL32(x, 9) ^ ROTL32(x, 17)
+#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23)
+
+static __forceinline__ __device__
+void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16])
+{
+ uint32_t tt1, tt2, i, t, ss1, ss2, x, y;
+ uint32_t w[68];
+ uint32_t a = digest[0];
+ uint32_t b = digest[1];
+ uint32_t c = digest[2];
+ uint32_t d = digest[3];
+ uint32_t e = digest[4];
+ uint32_t f = digest[5];
+ uint32_t g = digest[6];
+ uint32_t h = digest[7];
+
+ #pragma unroll
+ for (i = 0; i<16; i++) {
+ w[i] = cuda_swab32(pblock[i]);
+ }
+
+ for (i = 16; i<68; i++) {
+ x = ROTL32(w[i - 3], 15);
+ y = ROTL32(w[i - 13], 7);
+
+ x ^= w[i - 16];
+ x ^= w[i - 9];
+ y ^= w[i - 6];
+
+ w[i] = P1(x) ^ y;
+ }
+
+ for (i = 0; i<64; i++) {
+
+ t = (i < 16) ? 0x79cc4519 : 0x7a879d8a;
+
+ ss2 = ROTL32(a, 12);
+ ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7);
+ ss2 ^= ss1;
+
+ tt1 = d + ss2 + (w[i] ^ w[i + 4]);
+ tt2 = h + ss1 + w[i];
+
+ if (i < 16) {
+ tt1 += F(a, b, c);
+ tt2 += F(e, f, g);
+ }
+ else {
+ tt1 += FF(a, b, c);
+ tt2 += GG(e, f, g);
+ }
+ d = c;
+ c = ROTL32(b, 9);
+ b = a;
+ a = tt1;
+ h = g;
+ g = ROTL32(f, 19);
+ f = e;
+ e = P0(tt2);
+ }
+
+ digest[0] ^= a;
+ digest[1] ^= b;
+ digest[2] ^= c;
+ digest[3] ^= d;
+ digest[4] ^= e;
+ digest[5] ^= f;
+ digest[6] ^= g;
+ digest[7] ^= h;
+}
+
+/***************************************************/
+// GPU Hash Function
+__global__
+void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+ if (thread < threads)
+ {
+ const size_t hashPosition = thread;
+
+ uint32_t digest[8];
+ digest[0] = 0x7380166F;
+ digest[1] = 0x4914B2B9;
+ digest[2] = 0x172442D7;
+ digest[3] = 0xDA8A0600;
+ digest[4] = 0xA96F30BC;
+ digest[5] = 0x163138AA;
+ digest[6] = 0xE38DEE4D;
+ digest[7] = 0xB0FB0E4E;
+
+ uint32_t *pHash = &g_hash[hashPosition << 4];
+ sm3_compress2(digest, pHash);
+
+ uint32_t block[16];
+ block[0] = 0x80;
+
+ #pragma unroll
+ for (int i = 1; i < 14; i++)
+ block[i] = 0;
+
+ // count
+ block[14] = cuda_swab32(1 >> 23);
+ block[15] = cuda_swab32((1 << 9) + (0 << 3));
+
+ sm3_compress2(digest, block);
+
+ for (int i = 0; i < 8; i++)
+ pHash[i] = cuda_swab32(digest[i]);
+
+ for (int i = 8; i < 16; i++)
+ pHash[i] = 0;
+ }
+}
+
+__host__
+void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order)
+{
+ const uint32_t threadsperblock = 256;
+
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+	sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash);
+ //MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/x13/hsr.cu b/x13/hsr.cu
new file mode 100644
index 0000000000..e86444628d
--- /dev/null
+++ b/x13/hsr.cu
@@ -0,0 +1,265 @@
+/*
+ * X13 algorithm
+ */
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+}
+#include "sm3.h"
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+#include "x11/cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order);
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+// HSR CPU Hash
+extern "C" void hsr_hash(void *output, const void *input)
+{
+ // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13
+
+ sph_blake512_context ctx_blake;
+ sph_bmw512_context ctx_bmw;
+ sph_groestl512_context ctx_groestl;
+ sph_jh512_context ctx_jh;
+ sph_keccak512_context ctx_keccak;
+ sph_skein512_context ctx_skein;
+ sph_luffa512_context ctx_luffa;
+ sph_cubehash512_context ctx_cubehash;
+ sph_shavite512_context ctx_shavite;
+ sph_simd512_context ctx_simd;
+ sph_echo512_context ctx_echo;
+ sm3_ctx_t ctx_sm3;
+ sph_hamsi512_context ctx_hamsi;
+ sph_fugue512_context ctx_fugue;
+
+ uint32_t hash[32];
+ memset(hash, 0, sizeof hash);
+
+ sph_blake512_init(&ctx_blake);
+ sph_blake512(&ctx_blake, input, 80);
+ sph_blake512_close(&ctx_blake, (void*) hash);
+
+ sph_bmw512_init(&ctx_bmw);
+ sph_bmw512(&ctx_bmw, (const void*) hash, 64);
+ sph_bmw512_close(&ctx_bmw, (void*) hash);
+
+ sph_groestl512_init(&ctx_groestl);
+ sph_groestl512(&ctx_groestl, (const void*) hash, 64);
+ sph_groestl512_close(&ctx_groestl, (void*) hash);
+
+ sph_skein512_init(&ctx_skein);
+ sph_skein512(&ctx_skein, (const void*) hash, 64);
+ sph_skein512_close(&ctx_skein, (void*) hash);
+
+ sph_jh512_init(&ctx_jh);
+ sph_jh512(&ctx_jh, (const void*) hash, 64);
+ sph_jh512_close(&ctx_jh, (void*) hash);
+
+ sph_keccak512_init(&ctx_keccak);
+ sph_keccak512(&ctx_keccak, (const void*) hash, 64);
+ sph_keccak512_close(&ctx_keccak, (void*) hash);
+
+ sph_luffa512_init(&ctx_luffa);
+ sph_luffa512(&ctx_luffa, (const void*) hash, 64);
+ sph_luffa512_close (&ctx_luffa, (void*) hash);
+
+ sph_cubehash512_init(&ctx_cubehash);
+ sph_cubehash512(&ctx_cubehash, (const void*) hash, 64);
+ sph_cubehash512_close(&ctx_cubehash, (void*) hash);
+
+ sph_shavite512_init(&ctx_shavite);
+ sph_shavite512(&ctx_shavite, (const void*) hash, 64);
+ sph_shavite512_close(&ctx_shavite, (void*) hash);
+
+ sph_simd512_init(&ctx_simd);
+ sph_simd512(&ctx_simd, (const void*) hash, 64);
+ sph_simd512_close(&ctx_simd, (void*) hash);
+
+ sph_echo512_init(&ctx_echo);
+ sph_echo512(&ctx_echo, (const void*) hash, 64);
+ sph_echo512_close(&ctx_echo, (void*) hash);
+
+ sm3_init(&ctx_sm3);
+ sm3_update(&ctx_sm3, (const unsigned char*) hash, 64);
+ memset(hash, 0, sizeof hash);
+ sm3_close(&ctx_sm3, (void*) hash);
+
+ sph_hamsi512_init(&ctx_hamsi);
+ sph_hamsi512(&ctx_hamsi, (const void*) hash, 64);
+ sph_hamsi512_close(&ctx_hamsi, (void*) hash);
+
+ sph_fugue512_init(&ctx_fugue);
+ sph_fugue512(&ctx_fugue, (const void*) hash, 64);
+ sph_fugue512_close(&ctx_fugue, (void*) hash);
+
+ memcpy(output, hash, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+ uint32_t *pdata = work->data;
+ uint32_t *ptarget = work->target;
+ const uint32_t first_nonce = pdata[19];
+ int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
+ uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
+ //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+ if (opt_benchmark)
+ ((uint32_t*)ptarget)[7] = 0x000f;
+
+ if (!init[thr_id])
+ {
+ cudaSetDevice(device_map[thr_id]);
+ if (opt_cudaschedule == -1 && gpu_threads == 1) {
+ cudaDeviceReset();
+ // reduce cpu usage
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ CUDA_LOG_ERROR();
+ }
+ gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+ quark_blake512_cpu_init(thr_id, throughput);
+ quark_groestl512_cpu_init(thr_id, throughput);
+ quark_skein512_cpu_init(thr_id, throughput);
+ quark_bmw512_cpu_init(thr_id, throughput);
+ quark_keccak512_cpu_init(thr_id, throughput);
+ quark_jh512_cpu_init(thr_id, throughput);
+ x11_luffaCubehash512_cpu_init(thr_id, throughput);
+ x11_shavite512_cpu_init(thr_id, throughput);
+ if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
+ return 0;
+ }
+ x11_echo512_cpu_init(thr_id, throughput);
+ x13_hamsi512_cpu_init(thr_id, throughput);
+ x13_fugue512_cpu_init(thr_id, throughput);
+
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
+ cuda_check_cpu_init(thr_id, throughput);
+
+ init[thr_id] = true;
+ }
+
+ uint32_t endiandata[20];
+ for (int k=0; k < 20; k++)
+ be32enc(&endiandata[k], pdata[k]);
+
+ quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+ cuda_check_cpu_setTarget(ptarget);
+
+ do {
+ int order = 0;
+
+ quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
+ x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ sm3_cuda_hash_64(thr_id, throughput, d_hash[thr_id], order++);
+ x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+ *hashes_done = pdata[19] - first_nonce + throughput;
+
+ CUDA_LOG_ERROR();
+
+ work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+ if (work->nonces[0] != UINT32_MAX)
+ {
+ const uint32_t Htarg = ptarget[7];
+ uint32_t _ALIGN(64) vhash[8];
+ be32enc(&endiandata[19], work->nonces[0]);
+ hsr_hash(vhash, endiandata);
+
+ if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+ work->valid_nonces = 1;
+ work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+ work_set_target_ratio(work, vhash);
+ if (work->nonces[1] != 0) {
+ be32enc(&endiandata[19], work->nonces[1]);
+ hsr_hash(vhash, endiandata);
+ bn_set_target_ratio(work, vhash, 1);
+ work->valid_nonces++;
+ pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+ } else {
+ pdata[19] = work->nonces[0] + 1; // cursor
+ }
+ return work->valid_nonces;
+ }
+ else if (vhash[7] > Htarg) {
+ gpu_increment_reject(thr_id);
+ if (!opt_quiet)
+ gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+ pdata[19] = work->nonces[0] + 1;
+ continue;
+ }
+ }
+
+ if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+ pdata[19] = max_nonce;
+ break;
+ }
+ pdata[19] += throughput;
+
+ } while (!work_restart[thr_id].restart);
+
+ *hashes_done = pdata[19] - first_nonce;
+
+ CUDA_LOG_ERROR();
+
+ return 0;
+}
+
+// cleanup
+extern "C" void free_hsr(int thr_id)
+{
+ if (!init[thr_id])
+ return;
+
+ cudaThreadSynchronize();
+
+ cudaFree(d_hash[thr_id]);
+
+ quark_blake512_cpu_free(thr_id);
+ quark_groestl512_cpu_free(thr_id);
+ x11_simd512_cpu_free(thr_id);
+ x13_fugue512_cpu_free(thr_id);
+
+ cuda_check_cpu_free(thr_id);
+ CUDA_LOG_ERROR();
+
+ cudaDeviceSynchronize();
+ init[thr_id] = false;
+}
diff --git a/x13/sm3.c b/x13/sm3.c
new file mode 100644
index 0000000000..295ba15086
--- /dev/null
+++ b/x13/sm3.c
@@ -0,0 +1,220 @@
+/* ====================================================================
+ * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the GmSSL Project.
+ * (http://gmssl.org/)"
+ *
+ * 4. The name "GmSSL Project" must not be used to endorse or promote
+ * products derived from this software without prior written
+ * permission. For written permission, please contact
+ * guanzhi1980@gmail.com.
+ *
+ * 5. Products derived from this software may not be called "GmSSL"
+ * nor may "GmSSL" appear in their names without prior written
+ * permission of the GmSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the GmSSL Project
+ * (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#include <string.h>
+
+#include "sm3.h"
+
+void sm3_init(sm3_ctx_t *ctx)
+{
+ ctx->digest[0] = 0x7380166F;
+ ctx->digest[1] = 0x4914B2B9;
+ ctx->digest[2] = 0x172442D7;
+ ctx->digest[3] = 0xDA8A0600;
+ ctx->digest[4] = 0xA96F30BC;
+ ctx->digest[5] = 0x163138AA;
+ ctx->digest[6] = 0xE38DEE4D;
+ ctx->digest[7] = 0xB0FB0E4E;
+
+ ctx->nblocks = 0;
+ ctx->num = 0;
+}
+
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len)
+{
+ if (ctx->num) {
+ unsigned int left = SM3_BLOCK_SIZE - ctx->num;
+ if (data_len < left) {
+ memcpy(ctx->block + ctx->num, data, data_len);
+ ctx->num += data_len;
+ return;
+ } else {
+ memcpy(ctx->block + ctx->num, data, left);
+ sm3_compress(ctx->digest, ctx->block);
+ ctx->nblocks++;
+ data += left;
+ data_len -= left;
+ }
+ }
+ while (data_len >= SM3_BLOCK_SIZE) {
+ sm3_compress(ctx->digest, data);
+ ctx->nblocks++;
+ data += SM3_BLOCK_SIZE;
+ data_len -= SM3_BLOCK_SIZE;
+ }
+ ctx->num = data_len;
+ if (data_len) {
+ memcpy(ctx->block, data, data_len);
+ }
+}
+
+void sm3_close(void *cc, void *dst)
+{
+ sm3_final(cc, dst);
+ memset(cc, 0, sizeof(sm3_ctx_t));
+}
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char *digest)
+{
+ int i;
+ uint32_t *pdigest = (uint32_t *)digest;
+ uint32_t *count = (uint32_t *)(ctx->block + SM3_BLOCK_SIZE - 8);
+
+ ctx->block[ctx->num] = 0x80;
+
+ if (ctx->num + 9 <= SM3_BLOCK_SIZE) {
+ memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9);
+ } else {
+ memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1);
+ sm3_compress(ctx->digest, ctx->block);
+ memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
+ }
+
+ count[0] = cpu_to_be32((ctx->nblocks) >> 23);
+ count[1] = cpu_to_be32((ctx->nblocks << 9) + (ctx->num << 3));
+
+ sm3_compress(ctx->digest, ctx->block);
+ for (i = 0; i < sizeof(ctx->digest)/sizeof(ctx->digest[0]); i++) {
+ pdigest[i] = cpu_to_be32(ctx->digest[i]);
+ }
+}
+
+#define ROTATELEFT(X,n) (((X)<<(n)) | ((X)>>(32-(n))))
+
+#define P0(x) ((x) ^ ROTATELEFT((x),9) ^ ROTATELEFT((x),17))
+#define P1(x) ((x) ^ ROTATELEFT((x),15) ^ ROTATELEFT((x),23))
+
+#define FF0(x,y,z) ( (x) ^ (y) ^ (z))
+#define FF1(x,y,z) (((x) & (y)) | ( (x) & (z)) | ( (y) & (z)))
+
+#define GG0(x,y,z) ( (x) ^ (y) ^ (z))
+#define GG1(x,y,z) (((x) & (y)) | ( (~(x)) & (z)) )
+
+
+void sm3_compress(uint32_t digest[8], const unsigned char block[64])
+{
+ int j;
+ uint32_t W[68], W1[64];
+ const uint32_t *pblock = (const uint32_t *)block;
+
+ uint32_t A = digest[0];
+ uint32_t B = digest[1];
+ uint32_t C = digest[2];
+ uint32_t D = digest[3];
+ uint32_t E = digest[4];
+ uint32_t F = digest[5];
+ uint32_t G = digest[6];
+ uint32_t H = digest[7];
+ uint32_t SS1,SS2,TT1,TT2,T[64];
+
+ for (j = 0; j < 16; j++) {
+ W[j] = cpu_to_be32(pblock[j]);
+ }
+ for (j = 16; j < 68; j++) {
+ W[j] = P1( W[j-16] ^ W[j-9] ^ ROTATELEFT(W[j-3],15)) ^ ROTATELEFT(W[j - 13],7 ) ^ W[j-6];;
+ }
+ for( j = 0; j < 64; j++) {
+ W1[j] = W[j] ^ W[j+4];
+ }
+
+ for(j =0; j < 16; j++) {
+
+ T[j] = 0x79CC4519;
+ SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
+ SS2 = SS1 ^ ROTATELEFT(A,12);
+ TT1 = FF0(A,B,C) + D + SS2 + W1[j];
+ TT2 = GG0(E,F,G) + H + SS1 + W[j];
+ D = C;
+ C = ROTATELEFT(B,9);
+ B = A;
+ A = TT1;
+ H = G;
+ G = ROTATELEFT(F,19);
+ F = E;
+ E = P0(TT2);
+ }
+
+ for(j =16; j < 64; j++) {
+
+ T[j] = 0x7A879D8A;
+ SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7);
+ SS2 = SS1 ^ ROTATELEFT(A,12);
+ TT1 = FF1(A,B,C) + D + SS2 + W1[j];
+ TT2 = GG1(E,F,G) + H + SS1 + W[j];
+ D = C;
+ C = ROTATELEFT(B,9);
+ B = A;
+ A = TT1;
+ H = G;
+ G = ROTATELEFT(F,19);
+ F = E;
+ E = P0(TT2);
+ }
+
+ digest[0] ^= A;
+ digest[1] ^= B;
+ digest[2] ^= C;
+ digest[3] ^= D;
+ digest[4] ^= E;
+ digest[5] ^= F;
+ digest[6] ^= G;
+ digest[7] ^= H;
+}
+
+void sm3(const unsigned char *msg, size_t msglen,
+ unsigned char dgst[SM3_DIGEST_LENGTH])
+{
+ sm3_ctx_t ctx;
+
+ sm3_init(&ctx);
+ sm3_update(&ctx, msg, msglen);
+ sm3_final(&ctx, dgst);
+
+ memset(&ctx, 0, sizeof(sm3_ctx_t));
+}
diff --git a/x13/sm3.h b/x13/sm3.h
new file mode 100644
index 0000000000..05c6595d98
--- /dev/null
+++ b/x13/sm3.h
@@ -0,0 +1,109 @@
+/* ====================================================================
+ * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved.
+ * Copyright (c) 2017 - YiiMP (cleaned hmac dead stuff)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the GmSSL Project.
+ * (http://gmssl.org/)"
+ *
+ * 4. The name "GmSSL Project" must not be used to endorse or promote
+ * products derived from this software without prior written
+ * permission. For written permission, please contact
+ * guanzhi1980@gmail.com.
+ *
+ * 5. Products derived from this software may not be called "GmSSL"
+ * nor may "GmSSL" appear in their names without prior written
+ * permission of the GmSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the GmSSL Project
+ * (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef _SM3_H
+#define _SM3_H
+
+#define SM3_DIGEST_LENGTH 32
+#define SM3_BLOCK_SIZE 64
+#define SM3_CBLOCK (SM3_BLOCK_SIZE)
+#define SM3_HMAC_SIZE (SM3_DIGEST_LENGTH)
+
+
+#include <stddef.h>
+#include <string.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+ uint32_t digest[8];
+ int nblocks;
+ unsigned char block[64];
+ int num;
+} sm3_ctx_t;
+
+void sm3_init(sm3_ctx_t *ctx);
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
+void sm3_close(void *cc, void *dst);
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
+void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
+void sm3(const unsigned char *data, size_t datalen,
+ unsigned char digest[SM3_DIGEST_LENGTH]);
+
+#ifdef CPU_BIGENDIAN
+
+#define cpu_to_be16(v) (v)
+#define cpu_to_be32(v) (v)
+#define be16_to_cpu(v) (v)
+#define be32_to_cpu(v) (v)
+
+#else
+
+#define cpu_to_le16(v) (v)
+#define cpu_to_le32(v) (v)
+#define le16_to_cpu(v) (v)
+#define le32_to_cpu(v) (v)
+
+#define cpu_to_be16(v) (((v)<< 8) | ((v)>>8))
+#define cpu_to_be32(v) (((v)>>24) | (((v)>>8)&0xff00) | (((v)<<8)&0xff0000) | ((v)<<24))
+#define be16_to_cpu(v) cpu_to_be16(v)
+#define be32_to_cpu(v) cpu_to_be32(v)
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif