Skip to content

Commit

Permalink
Adopt KlausT's Loop Unroll + Funnel Shift Changes
Browse files Browse the repository at this point in the history
~360% performance improvement from KlausT's fork (https://github.com/KlausT/PascalCoin-CUDA/commit/4598579b2e802e0b6cb469df2181ee2d295717bf). This version doesn't adopt any of his timing/calculation changes.

Have yet to mine a block using this version.
  • Loading branch information
max-sanchez committed Jan 27, 2017
1 parent 85cffab commit 96d7a63
Showing 1 changed file with 17 additions and 26 deletions.
43 changes: 17 additions & 26 deletions kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,37 +19,25 @@ uint32_t *blockHeadermobj = nullptr;
uint32_t *midStatemobj = nullptr;
uint32_t *nonceOutmobj = nullptr;

__device__ __forceinline__ uint32_t
ror(const uint32_t a, const unsigned int n)
__device__ __forceinline__ uint32_t ror(const uint32_t a, const unsigned int n)
{
#if __CUDA_ARCH__ >= 999 // Disabled
uint32_t d;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(a), "r"(n));
return d;
#if __CUDA_ARCH__ >= 350
return __funnelshift_r((a), (a), (n));
#else
return (a >> n) | (a << (32 - n));
#endif
}

__device__ __forceinline__ uint32_t
shr(const uint32_t a, const unsigned int n)
{
#if __CUDA_ARCH__ >= 999 // Disabled
uint32_t d;
asm("vshr.u32.u32.u32.clamp %0, %1, %2;" : "=r"(d) : "r"(a), "r"(n));
return d;
#else
return a >> n;
#endif
}

#define shr (a) >> (n);

#define ROTRIGHT(a,b) ((a >> b) | (a << (32 - b)))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
#define SIG0c(x) (ror(x,7) ^ ror(x,18) ^ ((x) >> 3))
#define SIG1c(x) (ror(x,17) ^ ror(x,19) ^ ((x) >> 10))

#define blocksize 2048
#define blocksize 2048
#define npt 9

static const uint32_t k[64] = {
Expand Down Expand Up @@ -111,7 +99,8 @@ __global__ void __launch_bounds__(blocksize, 8) nonceGrindc(uint32_t *const __re
midstate[7] = midstateIn[7];

int j = 0;


#pragma unroll
for (j = 0; j < 16; j++)
{
buffer[j] = headerIn[j];
Expand Down Expand Up @@ -150,16 +139,17 @@ __global__ void __launch_bounds__(blocksize, 8) nonceGrindc(uint32_t *const __re
buffer[11] = n;
// printf("Nonce being used: %d\n" + buffer[11]);

#pragma unroll
for (j = 0; j < 16; j++)
{
block[j] = buffer[j];
}

#pragma unroll
for (j = 16; j < 64; j++)
{
block[j] = block[j - 16] + block[j - 7] + SIG1c(block[j - 2]) + SIG0c(block[j - 15]);
}

#pragma unroll
for (j = 0; j < 64; j++)
{
S1 = (ror(e, 6)) ^ (ror(e, 11)) ^ (ror(e, 25));
Expand Down Expand Up @@ -211,12 +201,14 @@ __global__ void __launch_bounds__(blocksize, 8) nonceGrindc(uint32_t *const __re
h5 = f = 0x9b05688c;
h6 = g = 0x1f83d9ab;
h7 = h = 0x5be0cd19;


#pragma unroll
for (j = 16; j < 64; j++)
{
block[j] = block[j - 16] + block[j - 7] + SIG1c(block[j - 2]) + SIG0c(block[j - 15]);
}

#pragma unroll
for (j = 0; j < 64; j++)
{
S1 = (ror(e, 6)) ^ (ror(e, 11)) ^ (ror(e, 25));
Expand Down Expand Up @@ -343,7 +335,7 @@ void getHeaderForWork(uint8_t *header)
void nonceGrindcuda(cudaStream_t cudastream, uint32_t threads, uint32_t *blockHeader, uint32_t *midState, uint32_t *nonceOut)
{
cudaError_t e = cudaGetLastError();
nonceGrindc << <128, 768, 2048, cudastream >> >(blockHeader, midState, nonceOut);
nonceGrindc << <128, 768, blocksize, cudastream >> >(blockHeader, midState, nonceOut);
e = cudaGetLastError();
if (e != cudaSuccess)
{
Expand Down Expand Up @@ -570,13 +562,12 @@ void grindNonces(uint32_t items_per_iter, int cycles_per_iter)
timestamp = ((timestamp & 0x000000FF) << 24) + ((timestamp & 0x0000FF00) << 8) + ((timestamp & 0x00FF0000) >> 8) + ((timestamp & 0xFF000000) >> 24);
printf("Found nonce: %08x T: %08x Hashrate: %.3f MH/s Total: %d\n", nonce, timestamp, (((((double)totalNonces) * 4 * 16 * 16 * 16 * 16) / (4)) / (((double)getTimeMillis() - start) / 1000)), totalNonces);

FILE* f2;

char fileName[13] = "datainXX.txt";
fileName[6] = (deviceToUse / 10) + 48;
fileName[7] = (deviceToUse % 10) + 48;
printf("Reading from %s\n", fileName);

f2 = fopen(fileName, "w");
while (f2 == NULL)
Expand Down Expand Up @@ -616,7 +607,7 @@ int main(int argc, char *argv[])

printf("Using Device: %d\n\n", deviceToUse);

unsigned int items_per_iter = 256 * 256 * 256 * 16;
unsigned int items_per_iter = 256 * 256 * 256 * 8;

unsigned int cycles_per_iter = 15;
double seconds_per_iter = 10.0;
Expand Down

0 comments on commit 96d7a63

Please sign in to comment.