cuda: get rid of cuda 9 mask warnings
tpruvot committed Jan 8, 2018
1 parent f1a7de4 commit 3761774
Showing 9 changed files with 200 additions and 150 deletions.
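CUDA 9 deprecated the implicit-mask warp intrinsics (__shfl, __shfl_up, __any, ...) in favour of *_sync variants that take an explicit member mask, so every call site using the old spelling now produces a deprecation warning. This commit wraps the old names in macros that forward to the *_sync forms with a full 0xFFFFFFFF mask. A minimal sketch of the two spellings (hypothetical kernel, not part of the tree; assumes an sm_30+ target):

#include <cuda_runtime.h>

// hypothetical kernel: broadcast lane 0's value to the whole warp
__global__ void broadcast_lane0(int *out)
{
	int v = threadIdx.x;
#if CUDA_VERSION >= 9000
	v = __shfl_sync(0xFFFFFFFFu, v, 0);   // explicit full-warp mask, no warning
#else
	v = __shfl(v, 0);                     // deprecated spelling, warns under CUDA 9+
#endif
	out[threadIdx.x] = v;
}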
5 changes: 3 additions & 2 deletions Makefile.am
@@ -111,9 +111,10 @@ endif
#ccminer_LDADD += -lsodium
ccminer_LDADD += -lcuda

nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\"

nvcc_ARCH :=
#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"

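Each -gencode=arch=compute_XX,code=\"sm_XX,compute_XX\" entry above triggers a separate device compilation pass, and within that pass __CUDA_ARCH__ equals the target value (500 for compute_50, 520 for compute_52), which is what the __CUDA_ARCH__ >= 300 guards around the new shuffle macros test. An illustrative sketch of such a guard (helper name is hypothetical, not part of the diff):

// __CUDA_ARCH__ is only defined in device compilation, once per -gencode entry
__device__ int lane0_value(int v)
{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
	return __shfl_sync(0xFFFFFFFFu, v, 0);  // Kepler and newer have warp shuffle
#else
	return v;                               // older targets: no shuffle instruction
#endif
}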
2 changes: 1 addition & 1 deletion configure.ac
@@ -1,4 +1,4 @@
AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer])

AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM
10 changes: 10 additions & 0 deletions cuda_helper.h
@@ -669,4 +669,14 @@ static uint2 SHR2(uint2 a, int offset)
#endif
}

// CUDA 9+ deprecated functions warnings (new mask param)
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#undef __shfl
#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
#undef __shfl_up
#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width)
#undef __any
#define __any(p) __any_sync(0xFFFFFFFFu, p)
#endif

#endif // #ifndef CUDA_HELPER_H
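With these redefinitions in place, existing device code keeps the pre-CUDA-9 spelling and transparently picks up the full-mask *_sync form when built against CUDA 9 or newer. A hedged usage sketch (hypothetical helpers, assuming the macros above are in effect):

#include "cuda_helper.h"

// hypothetical helpers, not from the tree
__device__ uint32_t broadcast_from_lane0(uint32_t v)
{
	// on CUDA 9+ the macro expands to __shfl_sync(0xFFFFFFFFu, (int)v, 0, 32)
	return (uint32_t) __shfl((int)v, 0, 32);
}

__device__ bool any_lane_hit(bool hit)
{
	// on CUDA 9+ the macro expands to __any_sync(0xFFFFFFFFu, hit)
	return __any(hit);
}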
13 changes: 11 additions & 2 deletions equi/cuda_equi.cu
@@ -65,7 +65,8 @@
#define __CUDA_ARCH__ 520
uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z);
uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z);
uint32_t __shfl2(uint32_t x, uint32_t y);
uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y);
uint32_t atomicExch(uint32_t *x, uint32_t y);
uint32_t atomicAdd(uint32_t *x, uint32_t y);
void __syncthreads(void);
@@ -79,6 +80,14 @@ u32 umin(const u32, const u32);
u32 umax(const u32, const u32);
#endif

#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
#undef __any
#define __any(p) __any_sync(0xFFFFFFFFu, p)
#else
#define __shfl2 __shfl
#endif

typedef u32 proof[PROOFSIZE];

struct __align__(32) slot {
@@ -1844,7 +1853,7 @@ __global__ void digit_last_wdc(equi<RB, SM>* eq)
}
#if __CUDA_ARCH__ >= 300
// all threads get the value from lane 0
soli = __shfl(soli, 0);
soli = __shfl2(soli, 0);
#else
__syncthreads();
soli = eq->edata.srealcont.nsols;
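__shfl2 is a two-argument convenience wrapper (value, source lane) so call sites can drop the width parameter; on CUDA 9+ it forwards to __shfl_sync with a full mask, otherwise it falls back to the plain __shfl. The call site changed above broadcasts the solution count held by lane 0 to the rest of the warp; a trimmed sketch of that pattern (standalone, nsols stands in for eq->edata.srealcont.nsols):

// trimmed sketch of the digit_last_wdc broadcast; assumes the __shfl2 macro above
__device__ uint32_t broadcast_sol_count(uint32_t nsols)
{
	uint32_t soli = 0;
	if ((threadIdx.x & 31) == 0)
		soli = nsols;            // only the first lane holds the real count
#if __CUDA_ARCH__ >= 300
	soli = __shfl2(soli, 0);     // every lane now reads lane 0's value
#endif
	return soli;
}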
50 changes: 28 additions & 22 deletions lyra2/cuda_lyra2_vectors.h
@@ -16,6 +16,12 @@
#define __shfl(x, y, z) (x)
#endif

#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
#else
#define __shfl2 __shfl
#endif

#if __CUDA_ARCH__ < 320 && !defined(__ldg4)
#define __ldg4(x) (*(x))
#endif
@@ -89,7 +95,7 @@ typedef struct __align__(16) uint28 {
typedef uint2x4 uint28; /* name deprecated */

typedef struct __builtin_align__(32) uint48 {
uint4 s0,s1;
uint4 s0,s1;
} uint48;

typedef struct __builtin_align__(128) uint4x16{
@@ -368,10 +374,10 @@ static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulong

static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + b; }
static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; }
static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; }

static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; }
static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; }
@@ -551,14 +557,14 @@ static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
{
#if __CUDA_ARCH__ >= 300
uint28 res;
res.x.x = __shfl(var.x.x, lane);
res.x.y = __shfl(var.x.y, lane);
res.y.x = __shfl(var.y.x, lane);
res.y.y = __shfl(var.y.y, lane);
res.z.x = __shfl(var.z.x, lane);
res.z.y = __shfl(var.z.y, lane);
res.w.x = __shfl(var.w.x, lane);
res.w.y = __shfl(var.w.y, lane);
res.x.x = __shfl2(var.x.x, lane);
res.x.y = __shfl2(var.x.y, lane);
res.y.x = __shfl2(var.y.x, lane);
res.y.y = __shfl2(var.y.y, lane);
res.z.x = __shfl2(var.z.x, lane);
res.z.y = __shfl2(var.z.y, lane);
res.w.x = __shfl2(var.w.x, lane);
res.w.y = __shfl2(var.w.y, lane);
return res;
#else
return var;
@@ -569,22 +575,22 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
{
#if __CUDA_ARCH__ >= 300
ulonglong4 res;
uint2 temp;
uint2 temp;
temp = vectorize(var.x);
temp.x = __shfl(temp.x, lane);
temp.y = __shfl(temp.y, lane);
temp.x = __shfl2(temp.x, lane);
temp.y = __shfl2(temp.y, lane);
res.x = devectorize(temp);
temp = vectorize(var.y);
temp.x = __shfl(temp.x, lane);
temp.y = __shfl(temp.y, lane);
temp.x = __shfl2(temp.x, lane);
temp.y = __shfl2(temp.y, lane);
res.y = devectorize(temp);
temp = vectorize(var.z);
temp.x = __shfl(temp.x, lane);
temp.y = __shfl(temp.y, lane);
temp.x = __shfl2(temp.x, lane);
temp.y = __shfl2(temp.y, lane);
res.z = devectorize(temp);
temp = vectorize(var.w);
temp.x = __shfl(temp.x, lane);
temp.y = __shfl(temp.y, lane);
temp.x = __shfl2(temp.x, lane);
temp.y = __shfl2(temp.y, lane);
res.w = devectorize(temp);
return res;
#else
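The ulonglong4 overload of shuffle4 moves each 64-bit word by splitting it into a uint2 with vectorize(), shuffling both 32-bit halves from the same lane, and reassembling with devectorize(), so only the 32-bit shuffle path is needed. A trimmed sketch of that round trip for a single 64-bit value (assumes vectorize()/devectorize() from cuda_helper.h and the __shfl2 macro defined above):

// sketch of one 64-bit broadcast, as done per component inside shuffle4
__device__ uint64_t shuffle64(uint64_t var, int lane)
{
#if __CUDA_ARCH__ >= 300
	uint2 t = vectorize(var);    // split into low (t.x) and high (t.y) halves
	t.x = __shfl2(t.x, lane);
	t.y = __shfl2(t.y, lane);
	return devectorize(t);       // recombine the shuffled halves
#else
	return var;                  // no warp shuffle before sm_30
#endif
}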
97 changes: 53 additions & 44 deletions scrypt/kepler_kernel.cu
@@ -10,6 +10,8 @@
#include <map>

#include <cuda_runtime.h>
#include <cuda_helper.h>

#include "miner.h"

#include "salsa_kernel.h"
@@ -18,6 +20,12 @@
#define TEXWIDTH 32768
#define THREADS_PER_WU 4 // four threads per hash

#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
#else
#define __shfl2 __shfl
#endif

typedef enum
{
ANDERSEN,
@@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right)
return left;
}

static __device__ uint4 __shfl(const uint4 bx, int target_thread) {
static __device__ uint4 shfl4(const uint4 bx, int target_thread) {
return make_uint4(
__shfl((int)bx.x, target_thread),
__shfl((int)bx.y, target_thread),
__shfl((int)bx.z, target_thread),
__shfl((int)bx.w, target_thread)
__shfl2((int)bx.x, target_thread),
__shfl2((int)bx.y, target_thread),
__shfl2((int)bx.z, target_thread),
__shfl2((int)bx.w, target_thread)
);
}

@@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start)

if (SCHEME == ANDERSEN) {
int target_thread = (threadIdx.x + 4)%32;
uint4 t=b, t2=__shfl(bx, target_thread);
int t2_start = __shfl((int)start, target_thread) + 4;
uint4 t = b, t2 = shfl4(bx, target_thread);
int t2_start = __shfl2((int)start, target_thread) + 4;
bool c = (threadIdx.x & 0x4);
*((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t);
*((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2);
@@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)

if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32];
if (SCHEME == ANDERSEN) {
int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4;
int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4;
if (TEX_DIM > 0) { start /= 4; t2_start /= 4; }
bool c = (threadIdx.x & 0x4);
if (TEX_DIM == 0) {
@@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start)
bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH));
}
uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx);
bx = __shfl(bx, (threadIdx.x + 28)%32);
bx = shfl4(bx, (threadIdx.x + 28)%32);
} else {
if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start]));
else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4);
@@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx)
int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3);
int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3);

b.w = __shfl((int)b.w, x1);
b.z = __shfl((int)b.z, x2);
b.y = __shfl((int)b.y, x3);
b.w = __shfl2((int)b.w, x1);
b.z = __shfl2((int)b.z, x2);
b.y = __shfl2((int)b.y, x3);

uint32_t tmp = b.y; b.y = b.w; b.w = tmp;

bx.w = __shfl((int)bx.w, x1);
bx.z = __shfl((int)bx.z, x2);
bx.y = __shfl((int)bx.y, x3);
bx.w = __shfl2((int)bx.w, x1);
bx.z = __shfl2((int)bx.z, x2);
bx.y = __shfl2((int)bx.y, x3);
tmp = bx.y; bx.y = bx.w; bx.w = tmp;
}

@@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x
/* Unclear if this optimization is needed: These are ordered based
* upon the dependencies needed in the later xors. Compiler should be
* able to figure this out, but might as well give it a hand. */
x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);

/* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first,
* but the register targets are rewritten here to swap x[1] and x[3] so that
@@ -333,9 +342,9 @@
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}

b += x;
@@ -352,18 +361,18 @@
XOR_ROTATE_ADD(x.w, x.z, x.y, 13);
XOR_ROTATE_ADD(x.x, x.w, x.z, 18);

x.y = __shfl((int)x.y, x3);
x.w = __shfl((int)x.w, x1);
x.z = __shfl((int)x.z, x2);
x.y = __shfl2((int)x.y, x3);
x.w = __shfl2((int)x.w, x1);
x.z = __shfl2((int)x.z, x2);

XOR_ROTATE_ADD(x.w, x.x, x.y, 7);
XOR_ROTATE_ADD(x.z, x.w, x.x, 9);
XOR_ROTATE_ADD(x.y, x.z, x.w, 13);
XOR_ROTATE_ADD(x.x, x.y, x.z, 18);

x.w = __shfl((int)x.w, x3);
x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
}

// At the end of these iterations, the data is in primary order again.
@@ -407,19 +416,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)

x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);

// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)

x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}

b += x;
Expand All @@ -436,19 +445,19 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)

x.y = __shfl((int)x.y, x1);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x3);
x.y = __shfl2((int)x.y, x1);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x3);

// Diagonal Mixing phase of chacha
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 12)
CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8)
CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7)

x.y = __shfl((int)x.y, x3);
x.z = __shfl((int)x.z, x2);
x.w = __shfl((int)x.w, x1);
x.y = __shfl2((int)x.y, x3);
x.z = __shfl2((int)x.z, x2);
x.w = __shfl2((int)x.w, x1);
}

#undef CHACHA_PRIMITIVE
@@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end)
} else load_key<ALGO>(d_odata, b, bx);

for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
uint4 t, tx; read_keys_direct<SCHEME, TEX_DIM>(t, tx, start+32*j);
b ^= t; bx ^= tx;
block_mixer<ALGO>(b, bx, x1, x2, x3);
@@ -604,15 +613,15 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
{
// better divergent thread handling submitted by nVidia engineers, but
// supposedly this does not run with the ANDERSEN memory access scheme
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int pos = j/LOOKUP_GAP;
int loop = -1;
uint4 t, tx;

int i = begin;
while(i < end) {
if (loop==-1) {
j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
pos = j/LOOKUP_GAP;
loop = j-pos*LOOKUP_GAP;
read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
@@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
// this is my original implementation, now used with the ANDERSEN
// memory access scheme only.
for (int i = begin; i < end; i++) {
int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
Expand All @@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign
}

//for (int i = begin; i < end; i++) {
// int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
// int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1));
// int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP;
// uint4 t, tx; read_keys_direct<SCHEME,TEX_DIM>(t, tx, start+32*pos);
// while(loop--) block_mixer<ALGO>(t, tx, x1, x2, x3);
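In these scrypt kernels four consecutive lanes cooperate on one hash (THREADS_PER_WU is 4), and threadIdx.x & 0x1c clears the low two lane bits, selecting the first lane of each 4-lane group, so the __shfl2((int)bx.x, threadIdx.x & 0x1c) calls make all four lanes of a work unit agree on the scratchpad slot j. The uint4 overload formerly spelled __shfl is renamed to shfl4 at the same time, keeping the intrinsic name free for the compatibility macros. A hedged sketch of the index broadcast (standalone helper; n_mask stands in for c_N_1):

// sketch of the scratchpad index computation used in kernelB / kernelB_LG;
// assumes sm_30+ (otherwise __shfl2 falls back to the plain __shfl)
__device__ int scratch_index(uint32_t bx_x, uint32_t n_mask /* c_N_1 */)
{
	// all four lanes of a hash read bx.x from the group's first lane,
	// so they compute the same j
	int j = __shfl2((int)bx_x, threadIdx.x & 0x1c);
	return j & (int) n_mask;
}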