diff --git a/INSTALL b/INSTALL index 988761c8db..cac32a1b62 100644 --- a/INSTALL +++ b/INSTALL @@ -1,77 +1,97 @@ - -You can use ./build.sh to configure and build with default options. - -It is advised to run ./autogen.sh before ./configure (autoconf and automake -need to be installed on your system for autogen.sh to work) - -./configure has an option named --with-cuda that allows you to specify -where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda, -but some distros may have a different default location) - - -** How to compile on Fedora 25 ** - -Note: You may find an alternative method via rpms : -see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/ - - -# Step 1: gcc and dependencies -dnf install gcc gcc-c++ autoconf automake -dnf install jansson-devel openssl-devel libcurl-devel zlib-devel - -# Step 2: nvidia drivers (Download common linux drivers from nvidia site) -dnf install kernel-devel -dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm -dnf check-update -dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia -ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so - -# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive) -# --override is required to ignore "too recent" gcc 6.3 -# --silent is required to install only the toolkit (no kmod) -./cuda_8.0.61_375.26_linux.run --toolkit --silent --override -nvcc --version - -# add the nvcc binary path to the system -ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made) -echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh - -# add the cudart library path to the system -echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf -ldconfig - -# Step 4: Fix the toolkit incompatibility with gcc 6 - -# You need to build yourself an older GCC/G++ version, i recommend the 5.4 -# see 
https://gcc.gnu.org/mirrors.html -# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix - -wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 -dnf install libmpc-devel mpfr-devel gmp-devel -./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib -make -j 8 && make install -(while this step, you have the time to cook something :p) - -# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h -# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported! - -./build.sh - -./ccminer -n - - -** How to compile on macOS ** - -# Step 1: download and install CUDA Toolkit 8 or more recent -# https://developer.nvidia.com/cuda-toolkit-archive - -# Step 2: install Homebrew -ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" - -# Step 3: dependencies -brew install pkg-config autoconf automake curl openssl llvm - -./build.sh - -./ccminer -n - + +You can use ./build.sh to configure and build with default options. + +It is advised to run ./autogen.sh before ./configure (autoconf and automake +need to be installed on your system for autogen.sh to work) + +./configure has an option named --with-cuda that allows you to specify +where your CUDA 6.5 toolkit is installed (usually /usr/local/cuda, +but some distros may have a different default location) + + +** How to compile on Ubuntu (16.04 LTS) + +First, install Cuda toolkit and nVidia Driver, and type `nvidia-smi` to check if your card is detected. 
+ +Install dependencies +```sudo apt-get install libcurl4-openssl-dev libssl-dev libjansson-dev automake autotools-dev build-essential``` + +Ubuntu is now shipped with gcc 6 or 7 so please install gcc/g++ 5 and make it the default (required by the cuda toolkit) +``` +sudo apt-get install gcc-5 g++-5 +sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 +``` + +Then use the helper ./build.sh in ccminer source folder, edit configure.sh and the Makefile.am if required. +``` +./build.sh +./ccminer --version +``` + + +** How to compile on Fedora 25 ** + +Note: You may find an alternative method via rpms : +see https://negativo17.org/nvidia-driver/ and https://negativo17.org/repos/multimedia/ + + +# Step 1: gcc and dependencies +dnf install gcc gcc-c++ autoconf automake +dnf install jansson-devel openssl-devel libcurl-devel zlib-devel + +# Step 2: nvidia drivers (Download common linux drivers from nvidia site) +dnf install kernel-devel +dnf install https://download1.rpmfusion.org/free/fedora/rpmfusion-free-release-$(rpm -E %fedora).noarch.rpm https://download1.rpmfusion.org/nonfree/fedora/rpmfusion-nonfree-release-$(rpm -E %fedora).noarch.rpm +dnf check-update +dnf install xorg-x11-drv-nvidia-cuda kmod-nvidia +ln -s libnvidia-ml.so.1 /usr/lib64/libnvidia-ml.so + +# Step 3: CUDA SDK (Download from nvidia the generic ".run" archive) +# --override is required to ignore "too recent" gcc 6.3 +# --silent is required to install only the toolkit (no kmod) +./cuda_8.0.61_375.26_linux.run --toolkit --silent --override +nvcc --version + +# add the nvcc binary path to the system +ln -s /usr/local/cuda-8.0 /usr/local/cuda # (if not already made) +echo 'export PATH=$PATH:/usr/local/cuda/bin' > /etc/profile.d/cuda.sh + +# add the cudart library path to the system +echo /usr/local/cuda/lib64 > /etc/ld.so.conf.d/cuda.conf +ldconfig + +# Step 4: Fix the toolkit incompatibility with gcc 6 + +# You need to build yourself an older GCC/G++ version, i recommend the 5.4 +# see 
https://gcc.gnu.org/mirrors.html +# Note: this manual method will override the default gcc, it could be better to use a custom toolchain prefix + +wget ftp://ftp.lip6.fr/pub/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 +dnf install libmpc-devel mpfr-devel gmp-devel +./configure --prefix=/usr/local --enable-languages=c,c++,lto --disable-multilib +make -j 8 && make install +(while this step, you have the time to cook something :p) + +# or, for previous fedora versions, edit the file /usr/local/cuda/include/host_config.h +# and comment/delete the line 121 : #error -- unsupported GNU version! gcc versions later than 5 are not supported! + +./build.sh + +./ccminer -n + + +** How to compile on macOS ** + +# Step 1: download and install CUDA Toolkit 8 or more recent +# https://developer.nvidia.com/cuda-toolkit-archive + +# Step 2: install Homebrew +ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + +# Step 3: dependencies +brew install pkg-config autoconf automake curl openssl llvm + +./build.sh + +./ccminer -n + diff --git a/JHA/jha.cu b/JHA/jha.cu index ec7895c10d..94172ff585 100644 --- a/JHA/jha.cu +++ b/JHA/jha.cu @@ -147,6 +147,9 @@ extern "C" int scanhash_jha(int thr_id, struct work *work, uint32_t max_nonce, u CUDA_LOG_ERROR(); } cuda_get_arch(thr_id); + if (cuda_arch[dev_id] >= 500) { + applog(LOG_WARNING, "You are not using the optimal algo, please try -a jackpot"); + } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); diff --git a/Makefile.am b/Makefile.am index cc7a1698d9..abf3115a1f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -66,14 +66,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/ripemd.c sph/sph_sha2.c \ lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ - tribus.cu \ + 
tribus/tribus.cu tribus/cuda_echo512_final.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ + x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ + x11/phi.cu x11/cuda_streebog_maxwell.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu # scrypt diff --git a/README.md b/README.md index 001c4ecab5..5bcb50572c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -ccminer -======= +# ccminer Based on Christian Buchner's & Christian H.'s CUDA project, no more active on github since 2014. @@ -10,7 +9,7 @@ BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot) A part of the recent algos were originally written by [djm34](https://github.com/djm34) and [alexis78](https://github.com/alexis78) This variant was tested and built on Linux (ubuntu server 14.04, 16.04, Fedora 22 to 25) -It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows Vista. +It is also built for Windows 7 to 10 with VStudio 2013, to stay compatible with Windows 7 and Vista. Note that the x86 releases are generally faster than x64 ones on Windows, but that tend to change with the recent drivers. @@ -22,9 +21,7 @@ About source code dependencies This project requires some libraries to be built : - OpenSSL (prebuilt for win) - - Curl (prebuilt for win) - - pthreads (prebuilt for win) The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 platforms (windows). 
@@ -32,5 +29,8 @@ The tree now contains recent prebuilt openssl and curl .lib for both x86 and x64 To rebuild them, you need to clone this repository and its submodules : git clone https://github.com/peters/curl-for-windows.git compat/curl-for-windows -On Linux, you can use the helper ./build.sh (edit configure.sh and the Makefile.am if required) +Compile on Linux +---------------- + +Please see [INSTALL](https://github.com/tpruvot/ccminer/blob/linux/INSTALL) file or [project Wiki](https://github.com/tpruvot/ccminer/wiki/Compatibility) diff --git a/README.txt b/README.txt index 0fa05f9b22..290ba605c1 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccminer 2.2 (August 2017) "Equihash, tribus and optimized skunk" +ccminer 2.2.2 (Oct. 2017) "phi and hsr algos" --------------------------------------------------------------- *************************************************************** @@ -90,7 +90,8 @@ its command line interface and options. fugue256 use to mine Fuguecoin groestl use to mine Groestlcoin heavy use to mine Heavycoin - jha use to mine JackpotCoin + hsr use to mine Hshare + jackpot use to mine Sweepcoin keccak use to mine Maxcoin lbry use to mine LBRY Credits luffa use to mine Joincoin @@ -102,6 +103,7 @@ its command line interface and options. neoscrypt use to mine FeatherCoin nist5 use to mine TalkCoin penta use to mine Joincoin / Pentablake + phi use to mine LUXCoin quark use to mine Quarkcoin qubit use to mine Qubit scrypt use to mine Scrypt coins @@ -155,6 +157,7 @@ its command line interface and options. 
-T, --timeout=N network timeout, in seconds (default: 300) -s, --scantime=N upper bound on time spent scanning current work when long polling is unavailable, in seconds (default: 5) + --submit-stale ignore stale job checks, may create more rejected shares -n, --ndevs list cuda devices -N, --statsavg number of samples used to display hashrate (default: 30) --no-gbt disable getblocktemplate support (height check in solo) @@ -277,6 +280,16 @@ so we can more efficiently implement new algorithms using the latest hardware features. >>> RELEASE HISTORY <<< + Oct. 09th 2017 v2.2.2 + Import and clean the hsr algo (x13 + custom hash) + Import and optimise phi algo from LuxCoin repository + Improve sib algo too for maxwell and pascal cards + Small fix to handle more than 9 cards on linux (-d 10+) + Attempt to free equihash memory "properly" + --submit-stale parameter for supernova pool (which change diff too fast) + + Sep. 01st 2017 v2.2.1 + Improve tribus algo on recent cards (up to +10%) Aug. 13th 2017 v2.2 New skunk algo, using the heavy streebog algorithm diff --git a/algos.h b/algos.h index b7dd0f21fa..3c1528b326 100644 --- a/algos.h +++ b/algos.h @@ -22,6 +22,7 @@ enum sha_algos { ALGO_GROESTL, ALGO_HEAVY, /* Heavycoin hash */ ALGO_HMQ1725, + ALGO_HSR, ALGO_KECCAK, ALGO_JACKPOT, ALGO_JHA, @@ -35,6 +36,7 @@ enum sha_algos { ALGO_NEOSCRYPT, ALGO_NIST5, ALGO_PENTABLAKE, + ALGO_PHI, ALGO_QUARK, ALGO_QUBIT, ALGO_SCRYPT, @@ -87,6 +89,7 @@ static const char *algo_names[] = { "groestl", "heavy", "hmq1725", + "hsr", "keccak", "jackpot", "jha", @@ -100,6 +103,7 @@ static const char *algo_names[] = { "neoscrypt", "nist5", "penta", + "phi", "quark", "qubit", "scrypt", @@ -161,12 +165,16 @@ static inline int algo_to_int(char* arg) i = ALGO_LUFFA; else if (!strcasecmp("hmq17", arg)) i = ALGO_HMQ1725; + else if (!strcasecmp("hshare", arg)) + i = ALGO_HSR; //else if (!strcasecmp("jackpot", arg)) // i = ALGO_JHA; else if (!strcasecmp("lyra2re", arg)) i = ALGO_LYRA2; else if 
(!strcasecmp("lyra2rev2", arg)) i = ALGO_LYRA2v2; + else if (!strcasecmp("phi1612", arg)) + i = ALGO_PHI; else if (!strcasecmp("bitcoin", arg)) i = ALGO_SHA256D; else if (!strcasecmp("sha256", arg)) diff --git a/bench.cpp b/bench.cpp index 8f32c40fd0..b1bb5bc87d 100644 --- a/bench.cpp +++ b/bench.cpp @@ -59,12 +59,14 @@ void algo_free_all(int thr_id) free_cryptonight(thr_id); free_decred(thr_id); free_deep(thr_id); + free_equihash(thr_id); free_keccak256(thr_id); free_fresh(thr_id); free_fugue256(thr_id); free_groestlcoin(thr_id); free_heavy(thr_id); free_hmq17(thr_id); + free_hsr(thr_id); free_jackpot(thr_id); free_jha(thr_id); free_lbry(thr_id); @@ -76,6 +78,7 @@ void algo_free_all(int thr_id) free_neoscrypt(thr_id); free_nist5(thr_id); free_pentablake(thr_id); + free_phi(thr_id); free_quark(thr_id); free_qubit(thr_id); free_skeincoin(thr_id); diff --git a/ccminer.cpp b/ccminer.cpp index dbab501ac1..a39e78cd0f 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -98,7 +98,7 @@ bool allow_gbt = true; bool allow_mininginfo = true; bool check_dups = true; //false; bool check_stratum_jobs = false; - +bool opt_submit_stale = false; bool submit_old = false; bool use_syslog = false; bool use_colors = true; @@ -263,6 +263,7 @@ Options:\n\ groestl Groestlcoin\n\ heavy Heavycoin\n\ hmq1725 Doubloons / Espers\n\ + jackpot JHA v8\n\ jha JHA v8 (JackpotCoin)\n\ keccak Keccak-256 (Maxcoin)\n\ lbry LBRY Credits (Sha/Ripemd)\n\ @@ -275,6 +276,7 @@ Options:\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ nist5 NIST5 (TalkCoin)\n\ penta Pentablake hash (5x Blake 512)\n\ + phi BHCoin\n\ quark Quark\n\ qubit Qubit\n\ sha256d SHA256d (bitcoin)\n\ @@ -288,7 +290,7 @@ Options:\n\ skunk Skein Cube Fugue Streebog\n\ s3 S3 (1Coin)\n\ timetravel Machinecoin permuted x8\n\ - tribus Denerius\n\ + tribus Denarius\n\ vanilla Blake256-8 (VNL)\n\ veltor Thorsriddle streebog\n\ whirlcoin Old Whirlcoin (Whirlpool algo)\n\ @@ -315,7 +317,7 @@ Options:\n\ --cuda-schedule Set device threads scheduling mode 
(default: auto)\n\ -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ - --vote=VOTE vote (for HeavyCoin)\n\ + --vote=VOTE vote (for decred and HeavyCoin)\n\ --trust-pool trust the max block reward vote (maxvote) sent by the pool\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -331,7 +333,8 @@ Options:\n\ --time-limit maximum time [s] to mine before exiting the program.\n\ -T, --timeout=N network timeout, in seconds (default: 300)\n\ -s, --scantime=N upper bound on time spent scanning current work when\n\ - long polling is unavailable, in seconds (default: 10)\n" + long polling is unavailable, in seconds (default: 10)\n\ + --submit-stale ignore stale jobs checks, may create more rejected shares\n\" #ifndef ORG "\ --segwit Agree with Segwit (Solo Mining only)\n" @@ -455,7 +458,8 @@ struct option options[] = { { "retries", 1, NULL, 'r' }, { "retry-pause", 1, NULL, 'R' }, { "scantime", 1, NULL, 's' }, - { "show-diff", 0, NULL, 1013 }, + { "show-diff", 0, NULL, 1013 }, // deprecated + { "submit-stale", 0, NULL, 1015 }, { "hide-diff", 0, NULL, 1014 }, { "statsavg", 1, NULL, 'N' }, { "gpu-clock", 1, NULL, 1070 }, @@ -927,7 +931,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* discard if a newer block was received */ stale_work = work->height && work->height < g_work.height; - if (have_stratum && !stale_work && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { + if (have_stratum && !stale_work && !opt_submit_stale && opt_algo != ALGO_ZR5 && opt_algo != ALGO_SCRYPT_JANE) { pthread_mutex_lock(&g_work_lock); if (strlen(work->job_id + 8)) stale_work = strncmp(work->job_id + 8, g_work.job_id + 8, sizeof(g_work.job_id) - 8); @@ -1049,7 +1053,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) applog(LOG_DEBUG, "share diff: %.5f (x %.1f)", stratum.sharediff, work->shareratio[idnonce]); 
- if (opt_vote) { // ALGO_HEAVY + if (opt_vote) { // ALGO_HEAVY ALGO_DECRED nvotestr = bin2hex((const uchar*)(&nvote), 2); sprintf(s, "{\"method\": \"mining.submit\", \"params\": [" "\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":%u}", @@ -2684,7 +2688,9 @@ static void *miner_thread(void *userdata) case ALGO_HEAVY: case ALGO_JACKPOT: case ALGO_JHA: + case ALGO_HSR: case ALGO_LYRA2v2: + case ALGO_PHI: case ALGO_S3: case ALGO_SKUNK: case ALGO_TIMETRAVEL: @@ -2789,6 +2795,8 @@ static void *miner_thread(void *userdata) rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_DECRED: + //applog(LOG_BLUE, "version %x, nbits %x, ntime %x extra %x", + // work.data[0], work.data[29], work.data[34], work.data[38]); rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_DEEP: @@ -2815,6 +2823,9 @@ static void *miner_thread(void *userdata) case ALGO_HMQ1725: rc = scanhash_hmq17(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_HSR: + rc = scanhash_hsr(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_HEAVY: rc = scanhash_heavy(thr_id, &work, max_nonce, &hashes_done, work.maxvote, HEAVYCOIN_BLKHDR_SZ); @@ -2864,6 +2875,9 @@ static void *miner_thread(void *userdata) case ALGO_PENTABLAKE: rc = scanhash_pentablake(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_PHI: + rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_SCRYPT: rc = scanhash_scrypt(thr_id, &work, max_nonce, &hashes_done, NULL, &tv_start, &tv_end); @@ -4118,6 +4132,9 @@ void parse_arg(int key, char *arg) case 1014: opt_showdiff = false; break; + case 1015: + opt_submit_stale = true; + break; case 'S': case 1018: applog(LOG_INFO, "Now logging to syslog..."); @@ -4186,10 +4203,10 @@ void parse_arg(int key, char *arg) { int device_thr[MAX_GPUS] = { 0 }; int ngpus = cuda_num_devices(); - char * pch = strtok (arg,","); + char* pch = strtok(arg,","); opt_n_threads = 0; while (pch != NULL && opt_n_threads < 
MAX_GPUS) { - if (pch[0] >= '0' && pch[0] <= '9' && pch[1] == '\0') + if (pch[0] >= '0' && pch[0] <= '9' && strlen(pch) <= 2) { if (atoi(pch) < ngpus) device_map[opt_n_threads++] = atoi(pch); @@ -4440,7 +4457,7 @@ int main(int argc, char *argv[]) #endif CUDART_VERSION/1000, (CUDART_VERSION % 1000)/10, arch); printf(" Originally based on Christian Buchner and Christian H. project\n"); - printf(" Include some algos from alexis78, djm34, sp, tsiv and klausT.\n\n"); + printf(" Include some kernels from alexis78, djm34, djEzo, tsiv and krnlx.\n\n"); printf("BTC donation address: 1AJdfCpLWPNoAMDfHF1wD5y8VgKSSTHxPo (tpruvot)\n\n"); } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 01d1aff5f8..f242c59d6f 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -331,6 +331,7 @@ + @@ -539,7 +540,11 @@ 64 - + + + compute_50,sm_50;compute_52,sm_52 + + @@ -556,8 +561,10 @@ 64 + + @@ -571,6 +578,8 @@ + + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 37372b42ab..87e18010bc 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -109,6 +109,9 @@ {031afae7-2a78-4e32-9738-4b589b6f7ff3} + + {1e548d79-c217-4203-989a-a592fe2b2de3} + @@ -222,6 +225,9 @@ Source Files\sph + + Source Files\sph + Source Files @@ -721,6 +727,12 @@ Source Files\CUDA\x13 + + Source Files\CUDA\x13 + + + Source Files\CUDA\x13 + Source Files\CUDA @@ -763,8 +775,17 @@ Source Files\CUDA\skunk - - Source Files\CUDA + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\tribus + + + Source Files\CUDA\x11 Source Files\CUDA\x11 @@ -772,6 +793,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index d07e736d21..7c280727b7 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -1,188 +1,188 @@ -/* CONFIG ONLY FOR MS VC++ BUILD */ - -/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP - systems. 
This function is required for `alloca.c' support on those systems. - */ -/* #undef CRAY_STACKSEG_END */ - -/* Define to 1 if using `alloca.c'. */ -/* #undef C_ALLOCA */ - -/* Define to 1 if you have `alloca', as a function or macro. */ -#define HAVE_ALLOCA 1 - -/* Define to 1 if you have and it should be used (not on Ultrix). - */ -#define HAVE_ALLOCA_H 1 - -/* Define to 1 if you have the declaration of `be32dec', and to 0 if you - don't. */ -#define HAVE_DECL_BE32DEC 0 - -/* Define to 1 if you have the declaration of `be32enc', and to 0 if you - don't. */ -#define HAVE_DECL_BE32ENC 0 - -/* Define to 1 if you have the declaration of `le32dec', and to 0 if you - don't. */ -#define HAVE_DECL_LE32DEC 0 - -/* Define to 1 if you have the declaration of `le32enc', and to 0 if you - don't. */ -#define HAVE_DECL_LE32ENC 0 - -/* Define to 1 if you have the `getopt_long' function. */ -#define HAVE_GETOPT_LONG 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the `crypto' library (-lcrypto). */ -#define HAVE_LIBCRYPTO 1 - -/* Define to 1 if you have a functional curl library. */ -#define HAVE_LIBCURL 1 - -/* Define to 1 if you have the `ssl' library (-lssl). */ -#define HAVE_LIBSSL 1 - -/* Define to 1 if you have the `z' library (-lz). */ -#define HAVE_LIBZ 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYSLOG_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_ENDIAN_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* Defined if libcurl supports AsynchDNS */ -/* #undef LIBCURL_FEATURE_ASYNCHDNS */ - -/* Defined if libcurl supports IDN */ -#define LIBCURL_FEATURE_IDN 1 - -/* Defined if libcurl supports IPv6 */ -#define LIBCURL_FEATURE_IPV6 1 - -/* Defined if libcurl supports KRB4 */ -/* #undef LIBCURL_FEATURE_KRB4 */ - -/* Defined if libcurl supports libz */ -#define LIBCURL_FEATURE_LIBZ 1 - -/* Defined if libcurl supports NTLM */ -#define LIBCURL_FEATURE_NTLM 1 - -/* Defined if libcurl supports SSL */ -#define LIBCURL_FEATURE_SSL 1 - -/* Defined if libcurl supports SSPI */ -/* #undef LIBCURL_FEATURE_SSPI */ - -/* Defined if libcurl supports DICT */ -/* #undef LIBCURL_PROTOCOL_DICT */ - -/* Defined if libcurl supports FILE */ -#define LIBCURL_PROTOCOL_FILE 1 - -/* Defined if libcurl supports FTP */ -#define LIBCURL_PROTOCOL_FTP 1 - -/* Defined if libcurl supports FTPS */ -#define LIBCURL_PROTOCOL_FTPS 1 - -/* Defined if libcurl supports HTTP */ -#define LIBCURL_PROTOCOL_HTTP 1 - -/* Defined if libcurl supports HTTPS */ -#define LIBCURL_PROTOCOL_HTTPS 1 - -/* Defined if libcurl supports IMAP */ -/* #undef LIBCURL_PROTOCOL_IMAP */ - -/* Defined if libcurl supports LDAP */ -/* #undef LIBCURL_PROTOCOL_LDAP */ - -/* Defined if libcurl supports POP3 */ -/* #undef LIBCURL_PROTOCOL_POP3 */ - -/* Defined if libcurl supports RTSP */ -/* #undef LIBCURL_PROTOCOL_RTSP */ - -/* Defined if libcurl supports SMTP */ -/* #undef LIBCURL_PROTOCOL_SMTP */ - -/* Defined if libcurl supports TELNET */ -/* #undef LIBCURL_PROTOCOL_TELNET */ - -/* Defined if libcurl supports TFTP */ -/* #undef LIBCURL_PROTOCOL_TFTP */ - -/* Define to 1 if your C compiler doesn't accept -c and -o together. 
*/ -/* #undef NO_MINUS_C_MINUS_O */ - -/* Name of package */ -#define PACKAGE "ccminer" - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "http://github.com/tpruvot/ccminer" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2" - -/* If using the C implementation of alloca, define if you know the - direction of stack growth for your system; otherwise it will be - automatically deduced at runtime. - STACK_DIRECTION > 0 => grows toward higher addresses - STACK_DIRECTION < 0 => grows toward lower addresses - STACK_DIRECTION = 0 => direction of growth unknown */ -/* #undef STACK_DIRECTION */ - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Define curl_free() as free() if our version of curl lacks curl_free. */ -/* #undef curl_free */ - -/* Define to `unsigned int' if does not define. */ -//#define size_t unsigned int - -#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 -#define HAVE_STRUCT_TIMESPEC -#endif +/* CONFIG ONLY FOR MS VC++ BUILD */ + +/* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP + systems. This function is required for `alloca.c' support on those systems. + */ +/* #undef CRAY_STACKSEG_END */ + +/* Define to 1 if using `alloca.c'. */ +/* #undef C_ALLOCA */ + +/* Define to 1 if you have `alloca', as a function or macro. */ +#define HAVE_ALLOCA 1 + +/* Define to 1 if you have and it should be used (not on Ultrix). + */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the declaration of `be32dec', and to 0 if you + don't. */ +#define HAVE_DECL_BE32DEC 0 + +/* Define to 1 if you have the declaration of `be32enc', and to 0 if you + don't. 
*/ +#define HAVE_DECL_BE32ENC 0 + +/* Define to 1 if you have the declaration of `le32dec', and to 0 if you + don't. */ +#define HAVE_DECL_LE32DEC 0 + +/* Define to 1 if you have the declaration of `le32enc', and to 0 if you + don't. */ +#define HAVE_DECL_LE32ENC 0 + +/* Define to 1 if you have the `getopt_long' function. */ +#define HAVE_GETOPT_LONG 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the `crypto' library (-lcrypto). */ +#define HAVE_LIBCRYPTO 1 + +/* Define to 1 if you have a functional curl library. */ +#define HAVE_LIBCURL 1 + +/* Define to 1 if you have the `ssl' library (-lssl). */ +#define HAVE_LIBSSL 1 + +/* Define to 1 if you have the `z' library (-lz). */ +#define HAVE_LIBZ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYSLOG_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_UNISTD_H 1 + +/* Defined if libcurl supports AsynchDNS */ +/* #undef LIBCURL_FEATURE_ASYNCHDNS */ + +/* Defined if libcurl supports IDN */ +#define LIBCURL_FEATURE_IDN 1 + +/* Defined if libcurl supports IPv6 */ +#define LIBCURL_FEATURE_IPV6 1 + +/* Defined if libcurl supports KRB4 */ +/* #undef LIBCURL_FEATURE_KRB4 */ + +/* Defined if libcurl supports libz */ +#define LIBCURL_FEATURE_LIBZ 1 + +/* Defined if libcurl supports NTLM */ +#define LIBCURL_FEATURE_NTLM 1 + +/* Defined if libcurl supports SSL */ +#define LIBCURL_FEATURE_SSL 1 + +/* Defined if libcurl supports SSPI */ +/* #undef LIBCURL_FEATURE_SSPI */ + +/* Defined if libcurl supports DICT */ +/* #undef LIBCURL_PROTOCOL_DICT */ + +/* Defined if libcurl supports FILE */ +#define LIBCURL_PROTOCOL_FILE 1 + +/* Defined if libcurl supports FTP */ +#define LIBCURL_PROTOCOL_FTP 1 + +/* Defined if libcurl supports FTPS */ +#define LIBCURL_PROTOCOL_FTPS 1 + +/* Defined if libcurl supports HTTP */ +#define LIBCURL_PROTOCOL_HTTP 1 + +/* Defined if libcurl supports HTTPS */ +#define LIBCURL_PROTOCOL_HTTPS 1 + +/* Defined if libcurl supports IMAP */ +/* #undef LIBCURL_PROTOCOL_IMAP */ + +/* Defined if libcurl supports LDAP */ +/* #undef LIBCURL_PROTOCOL_LDAP */ + +/* Defined if libcurl supports POP3 */ +/* #undef LIBCURL_PROTOCOL_POP3 */ + +/* Defined if libcurl supports RTSP */ +/* #undef LIBCURL_PROTOCOL_RTSP */ + +/* Defined if libcurl supports SMTP */ +/* #undef LIBCURL_PROTOCOL_SMTP */ + +/* Defined if libcurl supports TELNET */ +/* #undef LIBCURL_PROTOCOL_TELNET */ + +/* Defined if libcurl supports TFTP */ +/* #undef LIBCURL_PROTOCOL_TFTP */ + +/* Define to 1 if your C compiler doesn't accept -c and -o together. */ +/* #undef NO_MINUS_C_MINUS_O */ + +/* Name of package */ +#define PACKAGE "ccminer" + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "" + +/* Define to the full name of this package. 
*/ +#define PACKAGE_NAME "ccminer" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "http://github.com/tpruvot/ccminer" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "2.2.2" + +/* If using the C implementation of alloca, define if you know the + direction of stack growth for your system; otherwise it will be + automatically deduced at runtime. + STACK_DIRECTION > 0 => grows toward higher addresses + STACK_DIRECTION < 0 => grows toward lower addresses + STACK_DIRECTION = 0 => direction of growth unknown */ +/* #undef STACK_DIRECTION */ + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define curl_free() as free() if our version of curl lacks curl_free. */ +/* #undef curl_free */ + +/* Define to `unsigned int' if does not define. */ +//#define size_t unsigned int + +#if !defined(HAVE_STRUCT_TIMESPEC) && _MSC_VER >= 1900 +#define HAVE_STRUCT_TIMESPEC +#endif diff --git a/configure.ac b/configure.ac index c369201a86..061dfdd70e 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.2], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/crypto/xmr-rpc.cpp b/crypto/xmr-rpc.cpp index e50b28a96e..82b7845f27 100644 --- a/crypto/xmr-rpc.cpp +++ b/crypto/xmr-rpc.cpp @@ -30,6 +30,10 @@ #define MADV_HUGEPAGE 0 #endif +#ifndef MADV_HUGEPAGE +#define MADV_HUGEPAGE 0 +#endif + #ifndef PRIu64 #define PRIu64 "I64u" #endif diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu index 6fc864d307..b7bcbb5910 100644 --- a/equi/cuda_equi.cu +++ b/equi/cuda_equi.cu @@ -2081,7 +2081,7 @@ __host__ void eq_cuda_context::solve(const char *t // destructor template __host__ -eq_cuda_context::~eq_cuda_context() +void eq_cuda_context::freemem() { if (solutions) free(solutions); @@ -2104,6 +2104,12 @@ eq_cuda_context::~eq_cuda_context() } } +template 
+__host__ +eq_cuda_context::~eq_cuda_context() +{ + freemem(); +} #ifdef CONFIG_MODE_1 template class eq_cuda_context; diff --git a/equi/eqcuda.hpp b/equi/eqcuda.hpp index 68bdaf0265..7cb10c9630 100644 --- a/equi/eqcuda.hpp +++ b/equi/eqcuda.hpp @@ -90,7 +90,7 @@ template struct equi; struct eq_cuda_context_interface { - virtual ~eq_cuda_context_interface(); + //virtual ~eq_cuda_context_interface(); virtual void solve(const char *tequihash_header, unsigned int tequihash_header_len, @@ -99,6 +99,7 @@ struct eq_cuda_context_interface fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef); + public: int thread_id; int device_id; @@ -125,9 +126,9 @@ class eq_cuda_context : public eq_cuda_context_interface fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef); - public: eq_cuda_context(int thr_id, int dev_id); + void freemem(); ~eq_cuda_context(); }; diff --git a/equi/equihash.cpp b/equi/equihash.cpp index 2a6e5141fe..c9ac1fcf30 100644 --- a/equi/equihash.cpp +++ b/equi/equihash.cpp @@ -281,7 +281,11 @@ void free_equihash(int thr_id) if (!init[thr_id]) return; - delete(solvers[thr_id]); + // assume config 1 was used... 
interface destructor seems bad + eq_cuda_context* ptr = dynamic_cast*>(solvers[thr_id]); + ptr->freemem(); + ptr = NULL; + solvers[thr_id] = NULL; init[thr_id] = false; @@ -291,4 +295,3 @@ void free_equihash(int thr_id) void eq_cuda_context_interface::solve(const char *tequihash_header, unsigned int tequihash_header_len, const char* nonce, unsigned int nonce_len, fn_cancel cancelf, fn_solution solutionf, fn_hashdone hashdonef) { } -eq_cuda_context_interface::~eq_cuda_context_interface() { } diff --git a/miner.h b/miner.h index 382016a547..be5e34bf87 100644 --- a/miner.h +++ b/miner.h @@ -289,6 +289,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); +extern int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_jha(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); // quark method extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -300,6 +301,7 @@ extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, un extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned 
long *hashes_done); extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -350,6 +352,7 @@ extern void free_fugue256(int thr_id); extern void free_groestlcoin(int thr_id); extern void free_heavy(int thr_id); extern void free_hmq17(int thr_id); +extern void free_hsr(int thr_id); extern void free_jackpot(int thr_id); extern void free_jha(int thr_id); extern void free_lbry(int thr_id); @@ -361,6 +364,7 @@ extern void free_myriad(int thr_id); extern void free_neoscrypt(int thr_id); extern void free_nist5(int thr_id); extern void free_pentablake(int thr_id); +extern void free_phi(int thr_id); extern void free_quark(int thr_id); extern void free_qubit(int thr_id); extern void free_sha256d(int thr_id); @@ -897,6 +901,7 @@ void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void hmq17hash(void *output, const void *input); +void hsr_hash(void *output, const void *input); void keccak256_hash(void *state, const void *input); void jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); @@ -909,6 +914,7 @@ void myriadhash(void *state, const void *input); void neoscrypt(uchar *output, const uchar *input, uint32_t profile); void nist5hash(void *state, const void *input); void pentablakehash(void *output, const void *input); +void phihash(void *output, const void *input); void quarkhash(void *state, const void *input); void qubithash(void *state, const void *input); void scrypthash(void* output, const void* input); diff --git a/res/ccminer.rc b/res/ccminer.rc index d7cf1c2f5e..84be50d618 100644 --- a/res/ccminer.rc +++ 
b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,0,0 - PRODUCTVERSION 2,2,0,0 + FILEVERSION 2,2,2,0 + PRODUCTVERSION 2,2,2,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2" + VALUE "FileVersion", "2.2.2" VALUE "LegalCopyright", "Copyright (C) 2017" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2" + VALUE "ProductVersion", "2.2.2" END END BLOCK "VarFileInfo" diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu index c38de11d46..36ec7923c0 100644 --- a/skunk/cuda_skunk_streebog.cu +++ b/skunk/cuda_skunk_streebog.cu @@ -18,7 +18,7 @@ #include #include -#include "skunk/streebog_arrays.cuh" +#include "x11/streebog_arrays.cuh" //#define FULL_UNROLL __device__ __forceinline__ @@ -204,7 +204,7 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin __constant__ uint64_t target64[4]; __host__ -void skunk_set_target(uint32_t* ptarget) +void skunk_streebog_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64, ptarget, 4*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } diff --git a/skunk/skunk.cu b/skunk/skunk.cu index f89c5fb458..c1add50303 100644 --- a/skunk/skunk.cu +++ b/skunk/skunk.cu @@ -23,12 +23,12 @@ extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t s extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void x13_fugue512_cpu_free(int thr_id); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* 
d_resNonce); // krnlx merged kernel (for high-end cards only) extern void skunk_cpu_init(int thr_id, uint32_t threads); -extern void skunk_set_target(uint32_t* ptarget); +extern void skunk_streebog_set_target(uint32_t* ptarget); extern void skunk_setBlock_80(int thr_id, void *pdata); extern void skunk_cuda_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash); extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); @@ -117,10 +117,10 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); if (use_compat_kernels[thr_id]) { skein512_cpu_setBlock_80(endiandata); - streebog_set_target(ptarget); + streebog_sm3_set_target(ptarget); } else { skunk_setBlock_80(thr_id, endiandata); - skunk_set_target(ptarget); + skunk_streebog_set_target(ptarget); } do { @@ -129,7 +129,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce, skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); } else { skunk_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); diff --git a/tribus/cuda_echo512_aes.cuh b/tribus/cuda_echo512_aes.cuh new file mode 100644 index 0000000000..ff205aaeb7 --- /dev/null +++ b/tribus/cuda_echo512_aes.cuh @@ -0,0 +1,318 @@ +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define __CUDA_ARCH__ 520 +#include +#endif + +#undef ROL8 +#undef ROR8 +#undef ROL16 + +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t 
ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +__device__ uint32_t d_AES0[256] = { + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, + 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, + 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, + 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, + 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, + 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, + 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, + 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, + 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 
0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, + 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, + 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, + 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, + 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, + 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, + 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, + 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, + 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C +}; + +__device__ uint32_t d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, + 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, + 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 
0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, + 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, + 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, + 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, + 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, + 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, + 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, + 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, + 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, + 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, + 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 
0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, + 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, + 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, + 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, + 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 +}; + +__device__ __forceinline__ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 256) { + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} + +__device__ __forceinline__ +void aes_gpu_init256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); +} + +__device__ __forceinline__ +void aes_gpu_init128(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x 
<< 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); +} + +__device__ __forceinline__ +void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256]) +{ + if (threadIdx.x < 128) { + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); + } +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; +#ifdef INTENSIVE_GMF + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); +#else + y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)]; +#endif + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= 
__ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]); + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y0 ^= k0; + + y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]); + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= __ldg(&d_AES0[__byte_perm(x3, 0, 0x4440)]); + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + +#ifdef INTENSIVE_GMF + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); +#else + y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)]; +#endif + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 
0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, + uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)]; + y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)]; + y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)]; + y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]); + + y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]); + y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)]; + y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)]; + y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]); + + y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)]; + y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)]; + y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)]; + y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]); + + y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)]; + y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)]; + y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)]; + y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]); +} + +__device__ __forceinline__ +static void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + 
aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3); + + aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3); + + // hier werden wir ein carry brauchen (oder auch nicht) + k0++; +} + +__device__ __forceinline__ +static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ +static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; +} diff --git a/tribus/cuda_echo512_final.cu b/tribus/cuda_echo512_final.cu new file mode 100644 index 0000000000..b68a9c6ea5 --- /dev/null +++ b/tribus/cuda_echo512_final.cu @@ -0,0 +1,285 @@ +/** + * Based on Provos Alexis work - 2016 FOR SM 5+ + * + * final touch by tpruvot for tribus - 09 2017 + */ +#include +#include +#include + +#define INTENSIVE_GMF +#include "tribus/cuda_echo512_aes.cuh" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicExch(p,y) (*p) = y +#endif + +__device__ +static void echo_round(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) + AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0); + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + // Mix Columns + #pragma 
unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i +12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__global__ __launch_bounds__(256, 3) /* will force 80 registers */ +static void tribus_echo512_gpu_final(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target) +{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init256(sharedMemory); + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751,0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7,0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + uint32_t k0; + uint32_t h[16]; + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t *hash = (uint32_t*)&g_hash[thread<<3]; + + *(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]); + *(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]); + + uint64_t backup = *(uint64_t*)&h[6]; + + k0 = 512 + 8; + 
+ #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) + AES_2ROUND(sharedMemory,h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + uint32_t t = ((a ^ b) & 0x80808080); + uint32_t t2 = ((b ^ c) & 0x80808080); + uint32_t t3 = ((c ^ d) & 0x80808080); + + uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1); + uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1); + uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1); + + W[0 + i] = bc ^ d ^ abx; + W[4 + i] = a ^ cd ^ bcx; + W[8 + i] = ab ^ d ^ cdx; + W[12+ i] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12 + i]; + b = h[i + 4]; + c = P[12 + i + 4]; + d = P[12 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = abx ^ bc ^ d; + W[16 + i + 4] = bcx ^ a ^ cd; + W[16 + i + 8] = cdx ^ ab ^ d; + W[16 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24 + i]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = abx ^ bc ^ d; + W[32 + i + 4] = bcx ^ a ^ cd; + W[32 + i + 8] = cdx ^ ab ^ d; + W[32 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ 
((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = abx ^ bc ^ d; + W[48 + i + 4] = bcx ^ a ^ cd; + W[48 + i + 8] = cdx ^ ab ^ d; + W[48 + i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + } + + for (int k = 1; k < 9; k++) + echo_round(sharedMemory,W,k0); + + // Big Sub Words + uint32_t y0, y1, y2, y3; +// AES_2ROUND(sharedMemory,W[ 0], W[ 1], W[ 2], W[ 3], k0); + aes_round(sharedMemory, W[ 0], W[ 1], W[ 2], W[ 3], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 0], W[ 1], W[ 2], W[ 3]); + + aes_round(sharedMemory, W[ 4], W[ 5], W[ 6], W[ 7], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 4], W[ 5], W[ 6], W[ 7]); + aes_round(sharedMemory, W[ 8], W[ 9], W[10], W[11], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[ 8], W[ 9], W[10], W[11]); + + aes_round(sharedMemory, W[20], W[21], W[22], W[23], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[20], W[21], W[22], W[23]); + aes_round(sharedMemory, W[28], W[29], W[30], W[31], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[28], W[29], W[30], W[31]); + + aes_round(sharedMemory, W[32], W[33], W[34], W[35], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[32], W[33], W[34], W[35]); + aes_round(sharedMemory, W[40], W[41], W[42], W[43], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[40], W[41], W[42], W[43]); + + aes_round(sharedMemory, W[52], W[53], W[54], W[55], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[52], W[53], W[54], W[55]); + aes_round(sharedMemory, W[60], W[61], W[62], W[63], k0, y0, y1, y2, y3); + aes_round(sharedMemory, y0, y1, y2, y3, W[60], W[61], W[62], W[63]); + + uint32_t bc = W[22] ^ W[42]; + uint32_t t2 = (bc & 0x80808080); + W[ 6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[23] ^ W[43]; + t2 = (bc & 0x80808080); + W[ 7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[10] ^ W[54]; + t2 = (bc & 0x80808080); + W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + 
bc = W[11] ^ W[55]; + t2 = (bc & 0x80808080); + W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30] + ^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62]; + + if(check <= target){ + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } + } +} + +__host__ +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + tribus_echo512_gpu_final <<>> (threads, (uint64_t*)d_hash, d_resNonce, target); +} diff --git a/tribus.cu b/tribus/tribus.cu similarity index 71% rename from tribus.cu rename to tribus/tribus.cu index ed82850085..4516e7d69c 100644 --- a/tribus.cu +++ b/tribus/tribus.cu @@ -1,7 +1,7 @@ /** * Tribus Algo for Denarius * - * tpruvot@github 06 2017 - GPLv3 + * tpruvot@github 09 2017 - GPLv3 * */ extern "C" { @@ -16,9 +16,10 @@ extern "C" { void jh512_setBlock_80(int thr_id, uint32_t *endiandata); void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash); +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); static uint32_t *d_hash[MAX_GPUS]; - +static uint32_t *d_resNonce[MAX_GPUS]; // cpu hash @@ -46,6 +47,7 @@ extern "C" void tribus_hash(void *state, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -63,7 +65,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + 
cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -74,10 +77,15 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce quark_jh512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); // char[64] work space for hashes results CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -87,33 +95,43 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce be32enc(&endiandata[k], pdata[k]); jh512_setBlock_80(thr_id, endiandata); - cuda_check_cpu_setTarget(ptarget); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); work->valid_nonces = 0; do { int order = 1; - - // Hash with CUDA jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } *hashes_done = pdata[19] - first_nonce + throughput; - work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], 
d_hash[thr_id]); if (work->nonces[0] != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; be32enc(&endiandata[19], work->nonces[0]); tribus_hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; work_set_target_ratio(work, vhash); - work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; be32enc(&endiandata[19], work->nonces[1]); tribus_hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); @@ -127,7 +145,8 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce else if (vhash[7] > Htarg) { gpu_increment_reject(thr_id); if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); pdata[19] = work->nonces[0] + 1; continue; } @@ -144,7 +163,6 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce out: // *hashes_done = pdata[19] - first_nonce; - return work->valid_nonces; } @@ -157,8 +175,8 @@ extern "C" void free_tribus(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); - quark_groestl512_cpu_free(thr_id); cuda_check_cpu_free(thr_id); init[thr_id] = false; diff --git a/util.cpp b/util.cpp index 2c6a1e9469..5b87d9652a 100644 --- a/util.cpp +++ b/util.cpp @@ -2211,6 +2211,9 @@ void print_hash_tests(void) hmq17hash(&hash[0], &buf[0]); printpfx("hmq1725", hash); + hsr_hash(&hash[0], &buf[0]); + printpfx("hsr", hash); + jha_hash(&hash[0], &buf[0]); printpfx("jha", hash); @@ -2245,6 +2248,9 @@ void 
print_hash_tests(void) pentablakehash(&hash[0], &buf[0]); printpfx("pentablake", hash); + phihash(&hash[0], &buf[0]); + printpfx("phi", hash); + quarkhash(&hash[0], &buf[0]); printpfx("quark", hash); diff --git a/x11/c11.cu b/x11/c11.cu index 5dee17455a..8f8f6663b2 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -18,10 +18,13 @@ extern "C" #include "cuda_helper.h" #include "cuda_x11.h" +void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + #include #include static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; // Flax/Chaincoin C11 CPU Hash extern "C" void c11hash(void *output, const void *input) @@ -103,6 +106,7 @@ extern "C" void c11hash(void *output, const void *input) #endif static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -118,7 +122,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -127,6 +132,9 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -135,11 +143,13 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u quark_jh512_cpu_init(thr_id, throughput); x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); + 
if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); cuda_check_cpu_init(thr_id, throughput); @@ -151,7 +161,10 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u be32enc(&endiandata[k], pdata[k]); quark_blake512_cpu_setBlock_80(thr_id, endiandata); - cuda_check_cpu_setTarget(ptarget); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); do { int order = 0; @@ -175,24 +188,32 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("shavite:"); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("simd :"); - x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo => "); + + if (use_compat_kernels[thr_id]) { + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + work->nonces[1] = UINT32_MAX; + } else { + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } *hashes_done = pdata[19] - first_nonce + throughput; - work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (work->nonces[0] != UINT32_MAX) { - const uint32_t Htarg = ptarget[7]; uint32_t _ALIGN(64) vhash[8]; + const uint32_t Htarg = ptarget[7]; + const uint32_t startNounce = pdata[19]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNounce; be32enc(&endiandata[19], 
work->nonces[0]); c11hash(vhash, endiandata); if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { work->valid_nonces = 1; work_set_target_ratio(work, vhash); - work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); - if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNounce; be32enc(&endiandata[19], work->nonces[1]); c11hash(vhash, endiandata); bn_set_target_ratio(work, vhash, 1); @@ -206,7 +227,8 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u else if (vhash[7] > Htarg) { gpu_increment_reject(thr_id); if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); pdata[19] = work->nonces[0] + 1; continue; } @@ -234,6 +256,8 @@ extern "C" void free_c11(int thr_id) cudaThreadSynchronize(); cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id); x11_simd512_cpu_free(thr_id); diff --git a/x11/cuda_streebog.cu b/x11/cuda_streebog.cu index 228c691312..d6e3685e62 100644 --- a/x11/cuda_streebog.cu +++ b/x11/cuda_streebog.cu @@ -806,10 +806,11 @@ void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) #define T6(x) shared[6][x] #define T7(x) shared[7][x] +// Streebog final for Veltor and skunk on SM 3.x __constant__ uint64_t target64[4]; __host__ -void streebog_set_target(const uint32_t* ptarget) +void streebog_sm3_set_target(uint32_t* ptarget) { cudaMemcpyToSymbol(target64,ptarget,4*sizeof(uint64_t),0,cudaMemcpyHostToDevice); } @@ -995,7 +996,7 @@ void streebog_gpu_hash_64_final(uint64_t *g_hash, uint32_t* resNonce) } __host__ -void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) +void streebog_sm3_hash_64_final(int thr_id, 
uint32_t threads, uint32_t *d_hash,uint32_t* d_resNonce) { dim3 grid((threads + TPB-1) / TPB); dim3 block(TPB); diff --git a/x11/cuda_streebog_maxwell.cu b/x11/cuda_streebog_maxwell.cu new file mode 100644 index 0000000000..6a06332933 --- /dev/null +++ b/x11/cuda_streebog_maxwell.cu @@ -0,0 +1,309 @@ +/* + * Streebog GOST R 34.10-2012 CUDA implementation. + * + * https://tools.ietf.org/html/rfc6986 + * https://en.wikipedia.org/wiki/Streebog + * + * ==========================(LICENSE BEGIN)============================ + * + * @author Tanguy Pruvot - 2015 + * @author Alexis Provos - 2016 + */ + +// Further improved with shared memory partial utilization +// Tested under CUDA7.5 toolkit for cp 5.0/5.2 + +//#include +#include +#include +#include + +#include "streebog_arrays.cuh" + +//#define FULL_UNROLL +__device__ __forceinline__ +static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44440)] + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ shared[6][__byte_perm(state[1].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]); + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + 
^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44442)]; + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].x,0,0x44443)] + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)]) + ^ shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44443)]; + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]) + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44442)] + ^ shared[2][__byte_perm(state[5].y,0,0x44442)] + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)]) + ^ 
shared[2][__byte_perm(state[5].y,0,0x44443)] + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]) + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state) +{ + return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44440)] + ^ shared[3][__byte_perm(state[4].x,0,0x44440)] + ^ shared[4][__byte_perm(state[3].x,0,0x44440)] + ^ shared[5][__byte_perm(state[2].x,0,0x44440)] + ^ shared[6][__byte_perm(state[1].x,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]); + + return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44441)] + ^ shared[3][__byte_perm(state[4].x,0,0x44441)] + ^ shared[4][__byte_perm(state[3].x,0,0x44441)] + ^ shared[5][__byte_perm(state[2].x,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]) + ^ shared[6][__byte_perm(state[1].x,0,0x44441)]; + + return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44442)] + ^ shared[3][__byte_perm(state[4].x,0,0x44442)] + ^ shared[4][__byte_perm(state[3].x,0,0x44442)] + ^ shared[5][__byte_perm(state[2].x,0,0x44442)] + ^ shared[6][__byte_perm(state[1].x,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]); + + return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)]) + ^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)]) + ^ shared[2][__byte_perm(state[5].x,0,0x44443)] + ^ shared[3][__byte_perm(state[4].x,0,0x44443)] + ^ shared[4][__byte_perm(state[3].x,0,0x44443)] + ^ 
shared[5][__byte_perm(state[2].x,0,0x44443)] + ^ shared[6][__byte_perm(state[1].x,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]); + + return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44440)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44440)] + ^ shared[4][__byte_perm(state[3].y,0,0x44440)] + ^ shared[5][__byte_perm(state[2].y,0,0x44440)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]); + + return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)]) + ^ shared[2][__byte_perm(state[5].y,0,0x44441)] + ^ shared[3][__byte_perm(state[4].y,0,0x44441)] + ^ shared[4][__byte_perm(state[3].y,0,0x44441)] + ^ shared[5][__byte_perm(state[2].y,0,0x44441)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]); + + return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)]) + ^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)]) + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44442)] + ^ shared[4][__byte_perm(state[3].y,0,0x44442)] + ^ shared[5][__byte_perm(state[2].y,0,0x44442)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]); + + return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)]) + ^ shared[1][__byte_perm(state[6].y,0,0x44443)] + ^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)]) + ^ shared[3][__byte_perm(state[4].y,0,0x44443)] + ^ shared[4][__byte_perm(state[3].y,0,0x44443)] + ^ shared[5][__byte_perm(state[2].y,0,0x44443)] + ^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]) + ^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]); +} + +__device__ __forceinline__ +static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state) +{ + uint2 t[8]; + for(int i=0; 
i<12; i++){ + GOST_FS(shared,state, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] ^= *(uint2*)&CC[i][j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j] = t[ j]; + + GOST_FS_LDG(shared,K, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + state[ j]^= t[ j]; + + #pragma unroll 8 + for(int j=0;j<8;j++) + K[ j] = t[ j]; + } +} + +#define TPB 256 +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB, 3) +#else +__launch_bounds__(TPB, 3) +#endif +void streebog_gpu_hash_64_maxwell(uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 buf[8], t[8], temp[8], K0[8], hash[8]; + + __shared__ uint2 shared[8][256]; + shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]); + shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]); + shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]); + shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]); + shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]); + shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]); + shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]); + shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]); + + uint64_t* inout = &g_hash[thread<<3]; + + *(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]); + *(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]); + + __threadfence_block(); + + K0[0] = vectorize(0x74a5d4ce2efc83b3); + + #pragma unroll 8 + for(int i=0;i<8;i++){ + buf[ i] = K0[ 0] ^ hash[ i]; + } + + for(int i=0; i<12; i++){ + GOST_FS(shared, buf, temp); + #pragma unroll 8 + for(uint32_t j=0;j<8;j++){ + buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j]; + } + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + buf[ j]^= hash[ j]; + } + #pragma unroll 8 + for(int j=0;j<8;j++){ + K0[ j] = buf[ j]; + } + + K0[7].y ^= 0x00020000; + + GOST_FS(shared, K0, t); + + #pragma unroll 8 + for(int i=0;i<8;i++) + K0[ i] = t[ i]; + + t[7].y ^= 0x01000000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + buf[7].y ^= 0x01000000; + + GOST_FS(shared, 
buf,K0); + + buf[7].y ^= 0x00020000; + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j]; + + t[7].y ^= 0x00020000; + + GOST_E12(shared, K0, t); + + #pragma unroll 8 + for(int j=0;j<8;j++) + buf[ j] ^= t[ j]; + + GOST_FS(shared, buf,K0); // K = F(h) + + hash[7]+= vectorize(0x0100000000000000); + + #pragma unroll 8 + for(int j=0;j<8;j++) + t[ j] = K0[ j] ^ hash[ j]; + + GOST_E12(shared, K0, t); + + *(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0]; + *(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4]; +} + +__host__ +void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + dim3 grid((threads + TPB-1) / TPB); + dim3 block(TPB); + streebog_gpu_hash_64_maxwell <<>> ((uint64_t*)d_hash); +} diff --git a/x11/phi.cu b/x11/phi.cu new file mode 100644 index 0000000000..ab1f30833c --- /dev/null +++ b/x11/phi.cu @@ -0,0 +1,223 @@ +// +// +// PHI1612 algo +// Skein + JH + CubeHash + Fugue + Gost + Echo +// +// Implemented by anorganix @ bitcointalk on 01.10.2017 +// Feel free to send some satoshis to 1Bitcoin8tfbtGAQNFxDRUVUfFgFWKoWi9 +// +// + +extern "C" { +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_fugue.h" +#include "sph/sph_streebog.h" +#include "sph/sph_echo.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x11.h" + +extern void skein512_cpu_setBlock_80(void *pdata); +extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); +extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int 
thr_id); + +extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target); + +#include +#include + +static uint32_t *d_hash[MAX_GPUS]; +static uint32_t *d_resNonce[MAX_GPUS]; + +extern "C" void phihash(void *output, const void *input) +{ + unsigned char _ALIGN(128) hash[128] = { 0 }; + + sph_skein512_context ctx_skein; + sph_jh512_context ctx_jh; + sph_cubehash512_context ctx_cubehash; + sph_fugue512_context ctx_fugue; + sph_gost512_context ctx_gost; + sph_echo512_context ctx_echo; + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, input, 80); + sph_skein512_close(&ctx_skein, (void*)hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*)hash, 64); + sph_jh512_close(&ctx_jh, (void*)hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*)hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*)hash); + + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, (const void*)hash, 64); + sph_fugue512_close(&ctx_fugue, (void*)hash); + + sph_gost512_init(&ctx_gost); + sph_gost512(&ctx_gost, (const void*)hash, 64); + sph_gost512_close(&ctx_gost, (void*)hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*)hash, 64); + sph_echo512_close(&ctx_echo, (void*)hash); + + memcpy(output, hash, 32); +} + +#define _DEBUG_PREFIX "phi" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 
19 : 18; // 2^18 = 262144 cuda threads + if (device_sm[dev_id] >= 600) intensity = 20; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0xf; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_cubehash512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput), -1); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], 2 * sizeof(uint32_t))); + + cuda_check_cpu_init(thr_id, throughput); + init[thr_id] = true; + } + + uint32_t endiandata[20]; + + for (int k = 0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + skein512_cpu_setBlock_80((void*)endiandata); + if (use_compat_kernels[thr_id]) + cuda_check_cpu_setTarget(ptarget); + else + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + + do { + int order = 0; + + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + if (use_compat_kernels[thr_id]) { + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], 
NULL, d_hash[thr_id], order++); + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + } else { + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); + tribus_echo512_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id], AS_U64(&ptarget[6])); + cudaMemcpy(&work->nonces[0], d_resNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); + } + + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + const uint32_t startNonce = pdata[19]; + uint32_t _ALIGN(64) vhash[8]; + if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce; + be32enc(&endiandata[19], work->nonces[0]); + phihash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + *hashes_done = pdata[19] - first_nonce + throughput; + //work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + //if (work->nonces[1] != 0) { + if (work->nonces[1] != UINT32_MAX) { + work->nonces[1] += startNonce; + be32enc(&endiandata[19], work->nonces[1]); + phihash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } + else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + cudaMemset(d_resNonce[thr_id], 0xFF, 2 * sizeof(uint32_t)); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_phi(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + 
cudaFree(d_hash[thr_id]); + cudaFree(d_resNonce[thr_id]); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/x11/sib.cu b/x11/sib.cu index 158f85e287..c437523d03 100644 --- a/x11/sib.cu +++ b/x11/sib.cu @@ -18,6 +18,7 @@ extern "C" { #include "cuda_x11.h" extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); +extern void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash); #include #include @@ -98,6 +99,7 @@ extern "C" void sibhash(void *output, const void *input) #include "cuda_debug.cuh" static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -124,6 +126,9 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + quark_blake512_cpu_init(thr_id, throughput); quark_bmw512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); @@ -166,7 +171,10 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("jh512 :"); quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("keccak :"); - streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + if (use_compat_kernels[thr_id]) + streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + else + streebog_hash_64_maxwell(thr_id, throughput, d_hash[thr_id]); TRACE("gost :"); x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); TRACE("luffa+c:"); diff --git a/skunk/streebog_arrays.cuh b/x11/streebog_arrays.cuh similarity index 100% rename from skunk/streebog_arrays.cuh rename to x11/streebog_arrays.cuh diff --git 
a/x11/veltor.cu b/x11/veltor.cu index be05f5a7d5..7bc1e18dab 100644 --- a/x11/veltor.cu +++ b/x11/veltor.cu @@ -10,11 +10,17 @@ extern "C" { #include "cuda_x11.h" extern void skein512_cpu_setBlock_80(void *pdata); -extern void quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap); extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); -extern void streebog_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); -extern void streebog_set_target(const uint32_t* ptarget); + +// for SM3.x +extern void streebog_sm3_set_target(uint32_t* ptarget); +extern void streebog_sm3_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); + +// for latest cards only +extern void skunk_cpu_init(int thr_id, uint32_t threads); +extern void skunk_streebog_set_target(uint32_t* ptarget); +extern void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t* d_resNonce); #include #include @@ -23,7 +29,7 @@ extern void streebog_set_target(const uint32_t* ptarget); static uint32_t *d_hash[MAX_GPUS]; static uint32_t *d_resNonce[MAX_GPUS]; -// veltorcoin CPU Hash +// veltor CPU Hash extern "C" void veltorhash(void *output, const void *input) { unsigned char _ALIGN(128) hash[128] = { 0 }; @@ -53,6 +59,7 @@ extern "C" void veltorhash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { @@ -80,7 +87,9 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); - quark_skein512_cpu_init(thr_id, 
throughput); + skunk_cpu_init(thr_id, throughput); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + x11_shavite512_cpu_init(thr_id, throughput); CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); @@ -97,14 +106,20 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce skein512_cpu_setBlock_80(endiandata); cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t)); - streebog_set_target(ptarget); + if(use_compat_kernels[thr_id]) + streebog_sm3_set_target(ptarget); + else + skunk_streebog_set_target(ptarget); do { int order = 0; skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - streebog_cpu_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + if(use_compat_kernels[thr_id]) + streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); + else + skunk_cuda_streebog(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]); cudaMemcpy(h_resNonce, d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost); diff --git a/x13/cuda_hsr_sm3.cu b/x13/cuda_hsr_sm3.cu new file mode 100644 index 0000000000..5ce018626e --- /dev/null +++ b/x13/cuda_hsr_sm3.cu @@ -0,0 +1,139 @@ +#include +#include +#include + +#include +#include + +#define F(x, y, z) (((x) ^ (y) ^ (z))) +#define FF(x, y, z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define GG(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + +#define P0(x) x ^ ROTL32(x, 9) ^ ROTL32(x, 17) +#define P1(x) x ^ ROTL32(x, 15) ^ ROTL32(x, 23) + +static __forceinline__ __device__ +void sm3_compress2(uint32_t digest[8], const uint32_t pblock[16]) +{ + uint32_t tt1, tt2, i, t, ss1, ss2, x, y; + uint32_t w[68]; + uint32_t a = digest[0]; + uint32_t b = digest[1]; + uint32_t c = digest[2]; + uint32_t d = digest[3]; + uint32_t e = 
digest[4]; + uint32_t f = digest[5]; + uint32_t g = digest[6]; + uint32_t h = digest[7]; + + #pragma unroll + for (i = 0; i<16; i++) { + w[i] = cuda_swab32(pblock[i]); + } + + for (i = 16; i<68; i++) { + x = ROTL32(w[i - 3], 15); + y = ROTL32(w[i - 13], 7); + + x ^= w[i - 16]; + x ^= w[i - 9]; + y ^= w[i - 6]; + + w[i] = P1(x) ^ y; + } + + for (i = 0; i<64; i++) { + + t = (i < 16) ? 0x79cc4519 : 0x7a879d8a; + + ss2 = ROTL32(a, 12); + ss1 = ROTL32(ss2 + e + ROTL32(t, i), 7); + ss2 ^= ss1; + + tt1 = d + ss2 + (w[i] ^ w[i + 4]); + tt2 = h + ss1 + w[i]; + + if (i < 16) { + tt1 += F(a, b, c); + tt2 += F(e, f, g); + } + else { + tt1 += FF(a, b, c); + tt2 += GG(e, f, g); + } + d = c; + c = ROTL32(b, 9); + b = a; + a = tt1; + h = g; + g = ROTL32(f, 19); + f = e; + e = P0(tt2); + } + + digest[0] ^= a; + digest[1] ^= b; + digest[2] ^= c; + digest[3] ^= d; + digest[4] ^= e; + digest[5] ^= f; + digest[6] ^= g; + digest[7] ^= h; +} + +/***************************************************/ +// GPU Hash Function +__global__ +void sm3_gpu_hash_64(const uint32_t threads, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + const size_t hashPosition = thread; + + uint32_t digest[8]; + digest[0] = 0x7380166F; + digest[1] = 0x4914B2B9; + digest[2] = 0x172442D7; + digest[3] = 0xDA8A0600; + digest[4] = 0xA96F30BC; + digest[5] = 0x163138AA; + digest[6] = 0xE38DEE4D; + digest[7] = 0xB0FB0E4E; + + uint32_t *pHash = &g_hash[hashPosition << 4]; + sm3_compress2(digest, pHash); + + uint32_t block[16]; + block[0] = 0x80; + + #pragma unroll + for (int i = 1; i < 14; i++) + block[i] = 0; + + // count + block[14] = cuda_swab32(1 >> 23); + block[15] = cuda_swab32((1 << 9) + (0 << 3)); + + sm3_compress2(digest, block); + + for (int i = 0; i < 8; i++) + pHash[i] = cuda_swab32(digest[i]); + + for (int i = 8; i < 16; i++) + pHash[i] = 0; + } +} + +__host__ +void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *g_hash, int order) 
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	sm3_gpu_hash_64 <<<grid, block>>>(threads, g_hash);
+	//MyStreamSynchronize(NULL, order, thr_id);
+}
diff --git a/x13/hsr.cu b/x13/hsr.cu
new file mode 100644
index 0000000000..e86444628d
--- /dev/null
+++ b/x13/hsr.cu
@@ -0,0 +1,265 @@
+/*
+ * X13 algorithm
+ */
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+}
+#include "sm3.h"
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+#include "x11/cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void sm3_cuda_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash, int order);
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+// HSR CPU Hash
+extern "C" void hsr_hash(void *output, const void *input)
+{
+	// blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context 
ctx_simd; + sph_echo512_context ctx_echo; + sm3_ctx_t ctx_sm3; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + + uint32_t hash[32]; + memset(hash, 0, sizeof hash); + + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, input, 80); + sph_blake512_close(&ctx_blake, (void*) hash); + + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, (const void*) hash, 64); + sph_bmw512_close(&ctx_bmw, (void*) hash); + + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, (const void*) hash, 64); + sph_groestl512_close(&ctx_groestl, (void*) hash); + + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, (const void*) hash, 64); + sph_skein512_close(&ctx_skein, (void*) hash); + + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, (const void*) hash, 64); + sph_jh512_close(&ctx_jh, (void*) hash); + + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, (const void*) hash, 64); + sph_keccak512_close(&ctx_keccak, (void*) hash); + + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, (const void*) hash, 64); + sph_luffa512_close (&ctx_luffa, (void*) hash); + + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, (const void*) hash, 64); + sph_cubehash512_close(&ctx_cubehash, (void*) hash); + + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, (const void*) hash, 64); + sph_shavite512_close(&ctx_shavite, (void*) hash); + + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, (const void*) hash, 64); + sph_simd512_close(&ctx_simd, (void*) hash); + + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, (const void*) hash, 64); + sph_echo512_close(&ctx_echo, (void*) hash); + + sm3_init(&ctx_sm3); + sm3_update(&ctx_sm3, (const unsigned char*) hash, 64); + memset(hash, 0, sizeof hash); + sm3_close(&ctx_sm3, (void*) hash); + + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, (const void*) hash, 64); + sph_hamsi512_close(&ctx_hamsi, (void*) hash); + + sph_fugue512_init(&ctx_fugue); + 
sph_fugue512(&ctx_fugue, (const void*) hash, 64); + sph_fugue512_close(&ctx_fugue, (void*) hash); + + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + int intensity = 19; // (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8; + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + x11_luffaCubehash512_cpu_init(thr_id, throughput); + x11_shavite512_cpu_init(thr_id, throughput); + if (x11_simd512_cpu_init(thr_id, throughput) != 0) { + return 0; + } + x11_echo512_cpu_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + uint32_t endiandata[20]; + for (int k=0; k < 20; k++) + be32enc(&endiandata[k], pdata[k]); + + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + 
cuda_check_cpu_setTarget(ptarget); + + do { + int order = 0; + + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + sm3_cuda_hash_64(thr_id, throughput, d_hash[thr_id], order++); + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + + *hashes_done = pdata[19] - first_nonce + throughput; + + CUDA_LOG_ERROR(); + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + hsr_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + hsr_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; 
+ } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + CUDA_LOG_ERROR(); + + return 0; +} + +// cleanup +extern "C" void free_hsr(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + CUDA_LOG_ERROR(); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x13/sm3.c b/x13/sm3.c new file mode 100644 index 0000000000..295ba15086 --- /dev/null +++ b/x13/sm3.c @@ -0,0 +1,220 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2017 The GmSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. 
The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the GmSSL Project + * (http://gmssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * ====================================================================
+ */
+
+#include <string.h>
+
+#include "sm3.h"
+
+void sm3_init(sm3_ctx_t *ctx)
+{
+	ctx->digest[0] = 0x7380166F;
+	ctx->digest[1] = 0x4914B2B9;
+	ctx->digest[2] = 0x172442D7;
+	ctx->digest[3] = 0xDA8A0600;
+	ctx->digest[4] = 0xA96F30BC;
+	ctx->digest[5] = 0x163138AA;
+	ctx->digest[6] = 0xE38DEE4D;
+	ctx->digest[7] = 0xB0FB0E4E;
+
+	ctx->nblocks = 0;
+	ctx->num = 0;
+}
+
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len)
+{
+	if (ctx->num) {
+		unsigned int left = SM3_BLOCK_SIZE - ctx->num;
+		if (data_len < left) {
+			memcpy(ctx->block + ctx->num, data, data_len);
+			ctx->num += data_len;
+			return;
+		} else {
+			memcpy(ctx->block + ctx->num, data, left);
+			sm3_compress(ctx->digest, ctx->block);
+			ctx->nblocks++;
+			data += left;
+			data_len -= left;
+		}
+	}
+	while (data_len >= SM3_BLOCK_SIZE) {
+		sm3_compress(ctx->digest, data);
+		ctx->nblocks++;
+		data += SM3_BLOCK_SIZE;
+		data_len -= SM3_BLOCK_SIZE;
+	}
+	ctx->num = data_len;
+	if (data_len) {
+		memcpy(ctx->block, data, data_len);
+	}
+}
+
+void sm3_close(void *cc, void *dst)
+{
+	sm3_final(cc, dst);
+	memset(cc, 0, sizeof(sm3_ctx_t));
+}
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char *digest)
+{
+	int i;
+	uint32_t *pdigest = (uint32_t *)digest;
+	uint32_t *count = (uint32_t *)(ctx->block + SM3_BLOCK_SIZE - 8);
+
+	ctx->block[ctx->num] = 0x80;
+
+	if (ctx->num + 9 <= SM3_BLOCK_SIZE) {
+		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 9);
+	} else {
+		memset(ctx->block + ctx->num + 1, 0, SM3_BLOCK_SIZE - ctx->num - 1);
+		sm3_compress(ctx->digest, ctx->block);
+		memset(ctx->block, 0, SM3_BLOCK_SIZE - 8);
+	}
+
+	count[0] = cpu_to_be32((ctx->nblocks) >> 23);
+	count[1] = cpu_to_be32((ctx->nblocks << 9) + (ctx->num << 3));
+
+	sm3_compress(ctx->digest, ctx->block);
+	for (i = 0; i < sizeof(ctx->digest)/sizeof(ctx->digest[0]); i++) {
+		pdigest[i] = cpu_to_be32(ctx->digest[i]);
+	}
+}
+
+#define ROTATELEFT(X,n) 
(((X)<<(n)) | ((X)>>(32-(n)))) + +#define P0(x) ((x) ^ ROTATELEFT((x),9) ^ ROTATELEFT((x),17)) +#define P1(x) ((x) ^ ROTATELEFT((x),15) ^ ROTATELEFT((x),23)) + +#define FF0(x,y,z) ( (x) ^ (y) ^ (z)) +#define FF1(x,y,z) (((x) & (y)) | ( (x) & (z)) | ( (y) & (z))) + +#define GG0(x,y,z) ( (x) ^ (y) ^ (z)) +#define GG1(x,y,z) (((x) & (y)) | ( (~(x)) & (z)) ) + + +void sm3_compress(uint32_t digest[8], const unsigned char block[64]) +{ + int j; + uint32_t W[68], W1[64]; + const uint32_t *pblock = (const uint32_t *)block; + + uint32_t A = digest[0]; + uint32_t B = digest[1]; + uint32_t C = digest[2]; + uint32_t D = digest[3]; + uint32_t E = digest[4]; + uint32_t F = digest[5]; + uint32_t G = digest[6]; + uint32_t H = digest[7]; + uint32_t SS1,SS2,TT1,TT2,T[64]; + + for (j = 0; j < 16; j++) { + W[j] = cpu_to_be32(pblock[j]); + } + for (j = 16; j < 68; j++) { + W[j] = P1( W[j-16] ^ W[j-9] ^ ROTATELEFT(W[j-3],15)) ^ ROTATELEFT(W[j - 13],7 ) ^ W[j-6];; + } + for( j = 0; j < 64; j++) { + W1[j] = W[j] ^ W[j+4]; + } + + for(j =0; j < 16; j++) { + + T[j] = 0x79CC4519; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF0(A,B,C) + D + SS2 + W1[j]; + TT2 = GG0(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + for(j =16; j < 64; j++) { + + T[j] = 0x7A879D8A; + SS1 = ROTATELEFT((ROTATELEFT(A,12) + E + ROTATELEFT(T[j],j)), 7); + SS2 = SS1 ^ ROTATELEFT(A,12); + TT1 = FF1(A,B,C) + D + SS2 + W1[j]; + TT2 = GG1(E,F,G) + H + SS1 + W[j]; + D = C; + C = ROTATELEFT(B,9); + B = A; + A = TT1; + H = G; + G = ROTATELEFT(F,19); + F = E; + E = P0(TT2); + } + + digest[0] ^= A; + digest[1] ^= B; + digest[2] ^= C; + digest[3] ^= D; + digest[4] ^= E; + digest[5] ^= F; + digest[6] ^= G; + digest[7] ^= H; +} + +void sm3(const unsigned char *msg, size_t msglen, + unsigned char dgst[SM3_DIGEST_LENGTH]) +{ + sm3_ctx_t ctx; + + sm3_init(&ctx); + 
sm3_update(&ctx, msg, msglen); + sm3_final(&ctx, dgst); + + memset(&ctx, 0, sizeof(sm3_ctx_t)); +} diff --git a/x13/sm3.h b/x13/sm3.h new file mode 100644 index 0000000000..05c6595d98 --- /dev/null +++ b/x13/sm3.h @@ -0,0 +1,109 @@ +/* ==================================================================== + * Copyright (c) 2014 - 2016 The GmSSL Project. All rights reserved. + * Copyright (c) 2017 - YiiMP (cleaned hmac dead stuff) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the GmSSL Project. + * (http://gmssl.org/)" + * + * 4. The name "GmSSL Project" must not be used to endorse or promote + * products derived from this software without prior written + * permission. For written permission, please contact + * guanzhi1980@gmail.com. + * + * 5. Products derived from this software may not be called "GmSSL" + * nor may "GmSSL" appear in their names without prior written + * permission of the GmSSL Project. + * + * 6. 
Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the GmSSL Project
+ *     (http://gmssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE GmSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE GmSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+#ifndef _SM3_H
+#define _SM3_H
+
+#define SM3_DIGEST_LENGTH	32
+#define SM3_BLOCK_SIZE		64
+#define SM3_CBLOCK		(SM3_BLOCK_SIZE)
+#define SM3_HMAC_SIZE		(SM3_DIGEST_LENGTH)
+
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct {
+	uint32_t digest[8];
+	int nblocks;
+	unsigned char block[64];
+	int num;
+} sm3_ctx_t;
+
+void sm3_init(sm3_ctx_t *ctx);
+void sm3_update(sm3_ctx_t *ctx, const unsigned char* data, size_t data_len);
+void sm3_close(void *cc, void *dst);
+
+void sm3_final(sm3_ctx_t *ctx, unsigned char digest[SM3_DIGEST_LENGTH]);
+void sm3_compress(uint32_t digest[8], const unsigned char block[SM3_BLOCK_SIZE]);
+void sm3(const unsigned char *data, size_t datalen,
+	unsigned char digest[SM3_DIGEST_LENGTH]);
+
+#ifdef CPU_BIGENDIAN
+
+#define cpu_to_be16(v) (v)
+#define cpu_to_be32(v) (v)
+#define be16_to_cpu(v) (v)
+#define be32_to_cpu(v) (v)
+
+#else
+
+#define cpu_to_le16(v) (v) 
+#define cpu_to_le32(v) (v) +#define le16_to_cpu(v) (v) +#define le32_to_cpu(v) (v) + +#define cpu_to_be16(v) (((v)<< 8) | ((v)>>8)) +#define cpu_to_be32(v) (((v)>>24) | (((v)>>8)&0xff00) | (((v)<<8)&0xff0000) | ((v)<<24)) +#define be16_to_cpu(v) cpu_to_be16(v) +#define be32_to_cpu(v) cpu_to_be32(v) + +#endif + +#ifdef __cplusplus +} +#endif +#endif