2018-01-25 UPSTREAM: harmonize conflicts to merge 2.2.4

Merge upstream updates up to Jan 10, 2018 - version 2.2.4; commit 1b4512e Merge: d2b6e6c a847808 Author: nimbosa <darth.Serious@gmail.com> Date: Thu Jan 25 11:35:27 2018 +0800 Merge branch 'windows' of TPruvot/ccminer; commit a847808 Merge: 2b081c0 50781f0 Author: nimbosa <darth.Serious@gmail.com> Date: Thu Jan 25 11:08:34 2018 +0800 # Conflicts: # Algo256/cuda_blake256.cu # Algo256/cuda_cubehash256.cu # ccminer.cpp # lyra2/cuda_lyra2.cu # lyra2/cuda_lyra2Z.cu # lyra2/lyra2REv2.cu 2017-11-01 MOD: added newer nvcc ARCH 61, fixed missing ("). commit d2b6e6c Author: nimbosa <darth.Serious@gmail.com> Date: Thu Jan 25 09:34:19 2018 +0800 Lyra2: Translate japanese comments into english (tpruvot#40); commit 50781f0 Author: hakemimi <34901335+hakemimi@users.noreply.github.com> Date: Wed Jan 10 20:24:50 2018 +0900 * Revert "lyra2: fix compilation on japanese windows (tpruvot#38)" This reverts commit 30db7d2. * Restore comments removed in tpruvot#38, translate them into English The original line says "allocate XXXX bytes to adjust for X Warp." Remove the number from comment since they are trivial * Translate Japanese comments commit 30db7d2 Author: Guillaume George <lysandergc@gmail.com> Date: Wed Jan 10 02:32:12 2018 +0900 lyra2: fix compilation on japanese windows (tpruvot#38) Remove comment that prevent compilation on japanese windows (tpruvot#38) commit 3761774 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Jan 8 13:50:20 2018 +0100 cuda: get ride of cuda 9 mask warnings commit f1a7de4 Author: Chris Spillane <chris.spillane@tuta.io> Date: Fri Jan 5 10:56:54 2018 +0000 Update README.txt (tpruvot#37) remove duplicate x17 line, add a few more coin examples. 
commit b54be47 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Jan 4 17:51:49 2018 +0100 upgrade BIGNUM class for openssl 1.1 commit 69e1296 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Jan 4 16:06:01 2018 +0100 update README for 2.2.4 commit 73dd6aa Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Jan 4 15:46:39 2018 +0100 keccak: avoid to use twice cuda_default_throughput and drop useless gpu hash alloc... commit 11a512f Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Sat Aug 5 04:16:28 2017 +0200 change defaults to handle cuda 9+, disable heavy and SM 2.x Heavy is the only algo using thrust which is generally broken on new cuda releases mjollnir dropped too... never seen this coin anyway... commit 91af8ab Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Jan 4 15:19:20 2018 +0100 decred doesnt allow pow votes commit 6165562 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Fri Dec 15 01:53:23 2017 +0100 equihash: fix for cuda 9.1 build commit 72466ee Author: dementeb <dementeb@users.noreply.github.com> Date: Thu Jan 4 16:56:25 2018 +0300 api: update websocket sample (tpruvot#31) commit b70409a Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Dec 14 18:12:01 2017 +0100 lyra2RE: link the merged blake/keccak kernel into algos old keccak256_gpu_hash_32 kernel commented to reduce binary size compat. 
not yet tested on old cards commit 18d2991 Author: Myrinia <patrick.braunstorfer@gmail.com> Date: Thu Dec 14 16:19:45 2017 +0100 Improve Lyra2RE2 Performance Improved Lyra2Re2 Performance by 1 % commit 6c0e656 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Sat Dec 9 16:54:35 2017 +0100 keccak: fix issue with intensity commit 015d129 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Dec 4 21:47:31 2017 +0100 keccak second nonce, and higher intensity commit cf886b5 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Dec 4 15:58:07 2017 +0100 import and adapt alexis optimised keccak256 for SM 5+ and increase default intensity for these recent cards commit d19e2a1 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Sun Dec 3 16:49:23 2017 +0100 Add keccakc for creative coin Same hash as keccak, just different pool settings (sha256d and diff factor 256) commit df4fcbe Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Nov 20 08:50:32 2017 +0100 jha: remove deprecated warning commit bfa616a Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Nov 20 07:47:11 2017 +0100 linux: common openssl lib function for 1.1+ commit dfff3b3 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Nov 20 07:30:04 2017 +0100 windows: high precision timer before threads start commit 2e0a977 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Nov 16 09:21:38 2017 +0100 polytimos algo (6 chained algos with streebog) commit 2b081c0 Merge: 8affcb9 6c7fa7c Author: nimbosa <darth.Serious@gmail.com> Date: Wed Nov 1 19:35:34 2017 +0800 2017-10-10--UPSTREAM: merge changes as of Oct 10, 2017 commit 6c7fa7c Merge: e1575c5 8affcb9 Author: nimbosa <darth.Serious@gmail.com> Date: Wed Nov 1 19:32:42 2017 +0800 2017-10-10--UPSTREAM: merge changes as of Oct 10, 2017 commit e1575c5 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Oct 9 19:49:05 2017 +0200 update README for v2.2.2 commit ed27598 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Oct 
9 19:15:46 2017 +0200 import and clean hsr algo commit cf18cb6 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Oct 9 14:39:00 2017 +0200 equihash: missing free on benchs commit e505d10 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Oct 9 14:00:15 2017 +0200 new --submit-stale parameter (tpruvot#24) commit 5a90db1 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Mon Oct 9 13:26:29 2017 +0200 phi: maxwell opt (aes final + streebog) + fix the fugue leak.. Also update sib algo with this improvement commit 3dbcc5d Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Sat Oct 7 10:34:27 2017 +0200 Import phi algo (by anorganix) only made a few changes to keep algos in the alpha order commit 575aa1a Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Thu Oct 5 13:17:31 2017 +0200 ccminer: argument fix for 10th device selection commit 4d09d85 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Wed Sep 20 05:08:23 2017 +0200 readme: move ubuntu stuff in INSTALL file, no ads commit e3e8481 Author: cedric walter <cedric.walter@gmail.com> Date: Tue Sep 19 09:41:58 2017 +0200 readme: add howto for ubuntu (tpruvot#23) tested on 17.10 commit ebf055d Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Fri Sep 1 20:33:50 2017 +0200 update c11 like tribus + 2.2.1 readme commit d47dd9d Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Fri Sep 1 17:03:09 2017 +0200 tribus: optimised version for recent cards main improvement is on echo, based on alexis sib kernel work tested on SM 3.0 and more recent commit 949061d Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Sun Aug 27 19:17:42 2017 +0200 xmr: disable huge pages if not defined tpruvot#19 commit 8affcb9 Author: Nanashi-Meiyo-Meijin <31034492+Nanashi-Meiyo-Meijin@users.noreply.github.com> Date: Fri Aug 18 11:43:32 2017 +0900 v2.2-mod-r2 release commit 1e71dc5 Author: Tanguy Pruvot <tanguy.pruvot@gmail.com> Date: Wed Aug 16 13:42:21 2017 +0200 streebog: apply skunk improvements to veltor
nimbosa · Jan 25, 2018 · 26bceef
1 parent c9fb9ac
commit 26bceef
Show file tree

Hide file tree

Showing 58 changed files with 4,390 additions and 1,125 deletions.
diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
@@ -1,8 +1,10 @@
 /**
-* Blake-256 Cuda Kernel (Tested on SM 5.0)
-*
-* Tanguy Pruvot - Nov. 2014
-*/
+ * Blake-256 Cuda Kernel (Tested on SM 5.0)
+ *
+ * Tanguy Pruvot - Nov. 2014
+ *
+ * + merged blake+keccak kernel for lyra2v2
+ */
 extern "C" {
 #include "sph/sph_blake.h"
 }
@@ -14,20 +16,17 @@ extern "C" {
 #ifdef __INTELLISENSE__
 /* just for vstudio code colors */
 __device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c);
-
 #endif
+
 #define UINT2(x,y) make_uint2(x,y)
 
-__device__ __inline__ uint2 ROR8(const uint2 a)
-{
+__device__ __inline__ uint2 ROR8(const uint2 a) {
 	uint2 result;
 	result.x = __byte_perm(a.y, a.x, 0x0765);
 	result.y = __byte_perm(a.x, a.y, 0x0765);
-
 	return result;
 }
 
-
 static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
 }
@@ -76,18 +75,18 @@ static const uint32_t  c_u256[16] = {
 };
 
 __constant__ uint2 keccak_round_constants35[24] = {
-	{ 0x00000001ul, 0x00000000 },{ 0x00008082ul, 0x00000000 },
-	{ 0x0000808aul, 0x80000000 },{ 0x80008000ul, 0x80000000 },
-	{ 0x0000808bul, 0x00000000 },{ 0x80000001ul, 0x00000000 },
-	{ 0x80008081ul, 0x80000000 },{ 0x00008009ul, 0x80000000 },
-	{ 0x0000008aul, 0x00000000 },{ 0x00000088ul, 0x00000000 },
-	{ 0x80008009ul, 0x00000000 },{ 0x8000000aul, 0x00000000 },
-	{ 0x8000808bul, 0x00000000 },{ 0x0000008bul, 0x80000000 },
-	{ 0x00008089ul, 0x80000000 },{ 0x00008003ul, 0x80000000 },
-	{ 0x00008002ul, 0x80000000 },{ 0x00000080ul, 0x80000000 },
-	{ 0x0000800aul, 0x00000000 },{ 0x8000000aul, 0x80000000 },
-	{ 0x80008081ul, 0x80000000 },{ 0x00008080ul, 0x80000000 },
-	{ 0x80000001ul, 0x00000000 },{ 0x80008008ul, 0x80000000 }
+	{ 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 },
+	{ 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 },
+	{ 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 },
+	{ 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 },
+	{ 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 },
+	{ 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 },
+	{ 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 },
+	{ 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 },
+	{ 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 },
+	{ 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 },
+	{ 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 },
+	{ 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 }
 };
 
 
@@ -193,12 +192,12 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 	m[2] = block[2];
 	m[3] = block[3];
 
-#pragma unroll
+	#pragma unroll
 	for (int i = 4; i < 16; i++) {
 		m[i] = c_Padding[i];
 	}
 
-#pragma unroll 8
+	#pragma unroll 8
 	for (int i = 0; i < 8; i++)
 		v[i] = h[i];
 
@@ -212,7 +211,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 	v[14] = u256[6];
 	v[15] = u256[7];
 
-#pragma unroll 14
+	#pragma unroll 14
 	for (int r = 0; r < 14; r++) {
 		/* column step */
 		GS2(0, 4, 0x8, 0xC, 0x0);
@@ -226,7 +225,7 @@ static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint3
 		GS2(3, 4, 0x9, 0xE, 0xE);
 	}
 
-#pragma unroll 16
+	#pragma unroll 16
 	for (int i = 0; i < 16; i++) {
 		int j = i & 7;
 		h[j] ^= v[i];
@@ -238,10 +237,10 @@ static void __forceinline__ __device__ keccak_block(uint2 *s)
 	uint2 bc[5], tmpxor[5], u, v;
 	//	uint2 s[25];
 
-#pragma unroll 1
+	#pragma unroll 1
 	for (int i = 0; i < 24; i++)
 	{
-#pragma unroll
+		#pragma unroll
 		for (uint32_t x = 0; x < 5; x++)
 			tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
 
@@ -297,10 +296,10 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
-		uint32_t h[8];
-		//		uint32_t input[4];
 		const uint32_t T0 = 640;
-#pragma unroll 8
+
+		uint32_t h[8];
+		#pragma unroll 8
 		for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
 
 		uint32_t v[16];
@@ -311,8 +310,7 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 			0, 1, 0, 640
 		};
 
-		const uint32_t  u256[16] =
-		{
+		const uint32_t  u256[16] = {
 			0x243F6A88, 0x85A308D3,
 			0x13198A2E, 0x03707344,
 			0xA4093822, 0x299F31D0,
@@ -323,15 +321,14 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 			0x3F84D5B5, 0xB5470917
 		};
 
-		uint32_t m[16] =
-		{
+		uint32_t m[16] = {
 			c_data[0], c_data[1], c_data[2], nonce,
 			c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
 			c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
 			c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
 		};
 
-#pragma unroll 8
+		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 			v[i] = h[i];
 
@@ -380,7 +377,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 5, 10);
 		GSPREC(2, 7, 0x8, 0xD, 4, 0);
 		GSPREC(3, 4, 0x9, 0xE, 15, 8);
-
 		//	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 		GSPREC(0, 4, 0x8, 0xC, 9, 0);
 		GSPREC(1, 5, 0x9, 0xD, 5, 7);
@@ -399,7 +395,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 7, 5);
 		GSPREC(2, 7, 0x8, 0xD, 15, 14);
 		GSPREC(3, 4, 0x9, 0xE, 1, 9);
-
 		//	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 		GSPREC(0, 4, 0x8, 0xC, 12, 5);
 		GSPREC(1, 5, 0x9, 0xD, 1, 15);
@@ -409,7 +404,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 6, 3);
 		GSPREC(2, 7, 0x8, 0xD, 9, 2);
 		GSPREC(3, 4, 0x9, 0xE, 8, 11);
-
 		//	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 		GSPREC(0, 4, 0x8, 0xC, 13, 11);
 		GSPREC(1, 5, 0x9, 0xD, 7, 14);
@@ -446,7 +440,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 10, 11);
 		GSPREC(2, 7, 0x8, 0xD, 12, 13);
 		GSPREC(3, 4, 0x9, 0xE, 14, 15);
-
 		//	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 		GSPREC(0, 4, 0x8, 0xC, 14, 10);
 		GSPREC(1, 5, 0x9, 0xD, 4, 8);
@@ -456,7 +449,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(1, 6, 0xB, 0xC, 0, 2);
 		GSPREC(2, 7, 0x8, 0xD, 11, 7);
 		GSPREC(3, 4, 0x9, 0xE, 5, 3);
-
 		//	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 		GSPREC(0, 4, 0x8, 0xC, 11, 8);
 		GSPREC(1, 5, 0x9, 0xD, 12, 0);
@@ -476,9 +468,6 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 		GSPREC(2, 7, 0x8, 0xD, 4, 0);
 		GSPREC(3, 4, 0x9, 0xE, 15, 8);
 
-
-
-
 		h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
 		h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
 		h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
@@ -501,14 +490,12 @@ void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonc
 
 		keccak_gpu_state[16] = UINT2(0, 0x80000000);
 		keccak_block(keccak_gpu_state);
+
 		uint64_t *outputHash = (uint64_t *)Hash;
-#pragma unroll 4
+		#pragma unroll 4
 		for (int i = 0; i<4; i++)
 			outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]);
 	}
-
-
-
 }
 
 __global__ __launch_bounds__(256, 3)
@@ -520,16 +507,16 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 		uint32_t h[8];
 		uint32_t input[4];
 
-#pragma unroll
+		#pragma unroll
 		for (int i = 0; i < 8; i++) h[i] = cpu_h[i];
 
-#pragma unroll
+		#pragma unroll
 		for (int i = 0; i < 3; ++i) input[i] = c_data[i];
 
 		input[3] = startNonce + thread;
 		blake256_compress2nd(h, input, 640);
 
-#pragma unroll
+		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2 * i + 1]));
 		}
@@ -568,6 +555,8 @@ void blake256_cpu_init(int thr_id, uint32_t threads)
 	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
 }
 
+/** for lyra2v2 **/
+
 __host__
 void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
 {
@@ -588,4 +577,4 @@ void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const
 	dim3 block(threadsperblock);
 
 	blakeKeccak256_gpu_hash_80 << <grid, block, 0, stream >> > (threads, startNonce, (uint32_t *)Hash);
-}
+}
diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu
@@ -267,9 +267,9 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
 }
 
 #if __CUDA_ARCH__ >= 500
-__global__	__launch_bounds__(TPB50, 1)
+__global__    __launch_bounds__(TPB50, 1)
 #else
-__global__	__launch_bounds__(TPB35, 1)
+__global__    __launch_bounds__(TPB35, 1)
 #endif
 void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
 {
@@ -356,6 +356,7 @@ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,
 
 	cubehash256_gpu_hash_32 << <grid, block >> > (threads, startNounce, (uint2*)d_hash);
 }
+
 __host__
 void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream)
 {