Skip to content

Commit

Permalink
phi: maxwell opt (aes final + streebog)
Browse files Browse the repository at this point in the history
+ fix the fugue leak..

Also update sib algo with this improvement
  • Loading branch information
tpruvot committed Oct 9, 2017
1 parent 3dbcc5d commit 5a90db1
Show file tree
Hide file tree
Showing 9 changed files with 367 additions and 19 deletions.
3 changes: 2 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/c11.cu x11/phi.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu

# scrypt
ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \
Expand Down
3 changes: 2 additions & 1 deletion ccminer.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,6 @@
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath>
</CudaCompile>
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="quark\cuda_bmw512.cu">
<MaxRegCount>128</MaxRegCount>
</CudaCompile>
Expand Down Expand Up @@ -560,8 +559,10 @@
<MaxRegCount>64</MaxRegCount>
</CudaCompile>
<CudaCompile Include="x11\cuda_streebog.cu" />
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
<CudaCompile Include="x11\c11.cu" />
<CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" />
Expand Down
3 changes: 3 additions & 0 deletions ccminer.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,9 @@
<CudaCompile Include="x11\cuda_streebog.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\cuda_streebog_maxwell.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
Expand Down
8 changes: 4 additions & 4 deletions res/ccminer.rc
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//

VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,1,0
PRODUCTVERSION 2,2,1,0
FILEVERSION 2,2,2,0
PRODUCTVERSION 2,2,2,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
Expand All @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "2.2.1"
VALUE "FileVersion", "2.2.2"
VALUE "LegalCopyright", "Copyright (C) 2017"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.1"
VALUE "ProductVersion", "2.2.2"
END
END
BLOCK "VarFileInfo"
Expand Down
2 changes: 1 addition & 1 deletion skunk/cuda_skunk_streebog.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h>

#include "skunk/streebog_arrays.cuh"
#include "x11/streebog_arrays.cuh"

//#define FULL_UNROLL
__device__ __forceinline__
Expand Down
309 changes: 309 additions & 0 deletions x11/cuda_streebog_maxwell.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
/*
* Streebog GOST R 34.10-2012 CUDA implementation.
*
* https://tools.ietf.org/html/rfc6986
* https://en.wikipedia.org/wiki/Streebog
*
* ==========================(LICENSE BEGIN)============================
*
* @author Tanguy Pruvot - 2015
* @author Alexis Provos - 2016
*/

// Further improved with shared memory partial utilization
// Tested under CUDA7.5 toolkit for cp 5.0/5.2

//#include <miner.h>
#include <cuda_helper.h>
#include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h>

#include "streebog_arrays.cuh"

//#define FULL_UNROLL
__device__ __forceinline__
static void GOST_FS(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
{
return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
^ shared[1][__byte_perm(state[6].x,0,0x44440)]
^ shared[2][__byte_perm(state[5].x,0,0x44440)]
^ shared[3][__byte_perm(state[4].x,0,0x44440)]
^ shared[4][__byte_perm(state[3].x,0,0x44440)]
^ shared[5][__byte_perm(state[2].x,0,0x44440)]
^ shared[6][__byte_perm(state[1].x,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);

return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
^ shared[2][__byte_perm(state[5].x,0,0x44441)]
^ shared[3][__byte_perm(state[4].x,0,0x44441)]
^ shared[4][__byte_perm(state[3].x,0,0x44441)]
^ shared[5][__byte_perm(state[2].x,0,0x44441)]
^ shared[6][__byte_perm(state[1].x,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)]);

return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
^ shared[2][__byte_perm(state[5].x,0,0x44442)]
^ shared[3][__byte_perm(state[4].x,0,0x44442)]
^ shared[4][__byte_perm(state[3].x,0,0x44442)]
^ shared[5][__byte_perm(state[2].x,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)])
^ shared[6][__byte_perm(state[1].x,0,0x44442)];

return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
^ shared[1][__byte_perm(state[6].x,0,0x44443)]
^ shared[2][__byte_perm(state[5].x,0,0x44443)]
^ shared[3][__byte_perm(state[4].x,0,0x44443)]
^ __ldg(&T42[__byte_perm(state[3].x,0,0x44443)])
^ shared[5][__byte_perm(state[2].x,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)])
^ shared[6][__byte_perm(state[1].x,0,0x44443)];

return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
^ shared[1][__byte_perm(state[6].y,0,0x44440)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
^ shared[3][__byte_perm(state[4].y,0,0x44440)]
^ shared[4][__byte_perm(state[3].y,0,0x44440)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)])
^ shared[5][__byte_perm(state[2].y,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)]);

return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
^ shared[2][__byte_perm(state[5].y,0,0x44441)]
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
^ shared[3][__byte_perm(state[4].y,0,0x44441)]
^ shared[4][__byte_perm(state[3].y,0,0x44441)]
^ shared[5][__byte_perm(state[2].y,0,0x44441)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)]);

return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
^ shared[1][__byte_perm(state[6].y,0,0x44442)]
^ shared[2][__byte_perm(state[5].y,0,0x44442)]
^ shared[3][__byte_perm(state[4].y,0,0x44442)]
^ shared[4][__byte_perm(state[3].y,0,0x44442)]
^ shared[5][__byte_perm(state[2].y,0,0x44442)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)]);

return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44443)])
^ shared[2][__byte_perm(state[5].y,0,0x44443)]
^ shared[3][__byte_perm(state[4].y,0,0x44443)]
^ shared[4][__byte_perm(state[3].y,0,0x44443)]
^ shared[5][__byte_perm(state[2].y,0,0x44443)]
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)])
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)]);
}

__device__ __forceinline__
static void GOST_FS_LDG(const uint2 shared[8][256],const uint2 *const __restrict__ state,uint2* return_state)
{
return_state[0] = __ldg(&T02[__byte_perm(state[7].x,0,0x44440)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44440)])
^ shared[2][__byte_perm(state[5].x,0,0x44440)]
^ shared[3][__byte_perm(state[4].x,0,0x44440)]
^ shared[4][__byte_perm(state[3].x,0,0x44440)]
^ shared[5][__byte_perm(state[2].x,0,0x44440)]
^ shared[6][__byte_perm(state[1].x,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44440)]);

return_state[1] = __ldg(&T02[__byte_perm(state[7].x,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44441)])
^ shared[2][__byte_perm(state[5].x,0,0x44441)]
^ shared[3][__byte_perm(state[4].x,0,0x44441)]
^ shared[4][__byte_perm(state[3].x,0,0x44441)]
^ shared[5][__byte_perm(state[2].x,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44441)])
^ shared[6][__byte_perm(state[1].x,0,0x44441)];

return_state[2] = __ldg(&T02[__byte_perm(state[7].x,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44442)])
^ shared[2][__byte_perm(state[5].x,0,0x44442)]
^ shared[3][__byte_perm(state[4].x,0,0x44442)]
^ shared[4][__byte_perm(state[3].x,0,0x44442)]
^ shared[5][__byte_perm(state[2].x,0,0x44442)]
^ shared[6][__byte_perm(state[1].x,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44442)]);

return_state[3] = __ldg(&T02[__byte_perm(state[7].x,0,0x44443)])
^ __ldg(&T12[__byte_perm(state[6].x,0,0x44443)])
^ shared[2][__byte_perm(state[5].x,0,0x44443)]
^ shared[3][__byte_perm(state[4].x,0,0x44443)]
^ shared[4][__byte_perm(state[3].x,0,0x44443)]
^ shared[5][__byte_perm(state[2].x,0,0x44443)]
^ shared[6][__byte_perm(state[1].x,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].x,0,0x44443)]);

return_state[4] = __ldg(&T02[__byte_perm(state[7].y,0,0x44440)])
^ shared[1][__byte_perm(state[6].y,0,0x44440)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44440)])
^ shared[3][__byte_perm(state[4].y,0,0x44440)]
^ shared[4][__byte_perm(state[3].y,0,0x44440)]
^ shared[5][__byte_perm(state[2].y,0,0x44440)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44440)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44440)]);

return_state[5] = __ldg(&T02[__byte_perm(state[7].y,0,0x44441)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44441)])
^ shared[2][__byte_perm(state[5].y,0,0x44441)]
^ shared[3][__byte_perm(state[4].y,0,0x44441)]
^ shared[4][__byte_perm(state[3].y,0,0x44441)]
^ shared[5][__byte_perm(state[2].y,0,0x44441)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44441)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44441)]);

return_state[6] = __ldg(&T02[__byte_perm(state[7].y,0,0x44442)])
^ __ldg(&T12[__byte_perm(state[6].y,0,0x44442)])
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44442)])
^ shared[3][__byte_perm(state[4].y,0,0x44442)]
^ shared[4][__byte_perm(state[3].y,0,0x44442)]
^ shared[5][__byte_perm(state[2].y,0,0x44442)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44442)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44442)]);

return_state[7] = __ldg(&T02[__byte_perm(state[7].y,0,0x44443)])
^ shared[1][__byte_perm(state[6].y,0,0x44443)]
^ __ldg(&T22[__byte_perm(state[5].y,0,0x44443)])
^ shared[3][__byte_perm(state[4].y,0,0x44443)]
^ shared[4][__byte_perm(state[3].y,0,0x44443)]
^ shared[5][__byte_perm(state[2].y,0,0x44443)]
^ __ldg(&T72[__byte_perm(state[0].y,0,0x44443)])
^ __ldg(&T62[__byte_perm(state[1].y,0,0x44443)]);
}

__device__ __forceinline__
static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uint2 *const __restrict__ state)
{
uint2 t[8];
for(int i=0; i<12; i++){
GOST_FS(shared,state, t);

#pragma unroll 8
for(int j=0;j<8;j++)
K[ j] ^= *(uint2*)&CC[i][j];

#pragma unroll 8
for(int j=0;j<8;j++)
state[ j] = t[ j];

GOST_FS_LDG(shared,K, t);

#pragma unroll 8
for(int j=0;j<8;j++)
state[ j]^= t[ j];

#pragma unroll 8
for(int j=0;j<8;j++)
K[ j] = t[ j];
}
}

#define TPB 256
__global__
#if __CUDA_ARCH__ > 500
__launch_bounds__(TPB, 3)
#else
__launch_bounds__(TPB, 3)
#endif
void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint2 buf[8], t[8], temp[8], K0[8], hash[8];

__shared__ uint2 shared[8][256];
shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]);
shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]);
shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]);
shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]);
shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]);
shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]);
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);

uint64_t* inout = &g_hash[thread<<3];

*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);

__threadfence_block();

K0[0] = vectorize(0x74a5d4ce2efc83b3);

#pragma unroll 8
for(int i=0;i<8;i++){
buf[ i] = K0[ 0] ^ hash[ i];
}

for(int i=0; i<12; i++){
GOST_FS(shared, buf, temp);
#pragma unroll 8
for(uint32_t j=0;j<8;j++){
buf[ j] = temp[ j] ^ *(uint2*)&precomputed_values[i][j];
}
}
#pragma unroll 8
for(int j=0;j<8;j++){
buf[ j]^= hash[ j];
}
#pragma unroll 8
for(int j=0;j<8;j++){
K0[ j] = buf[ j];
}

K0[7].y ^= 0x00020000;

GOST_FS(shared, K0, t);

#pragma unroll 8
for(int i=0;i<8;i++)
K0[ i] = t[ i];

t[7].y ^= 0x01000000;

GOST_E12(shared, K0, t);

#pragma unroll 8
for(int j=0;j<8;j++)
buf[ j] ^= t[ j];

buf[7].y ^= 0x01000000;

GOST_FS(shared, buf,K0);

buf[7].y ^= 0x00020000;

#pragma unroll 8
for(int j=0;j<8;j++)
t[ j] = K0[ j];

t[7].y ^= 0x00020000;

GOST_E12(shared, K0, t);

#pragma unroll 8
for(int j=0;j<8;j++)
buf[ j] ^= t[ j];

GOST_FS(shared, buf,K0); // K = F(h)

hash[7]+= vectorize(0x0100000000000000);

#pragma unroll 8
for(int j=0;j<8;j++)
t[ j] = K0[ j] ^ hash[ j];

GOST_E12(shared, K0, t);

*(uint2x4*)&inout[0] = *(uint2x4*)&t[0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0];
*(uint2x4*)&inout[4] = *(uint2x4*)&t[4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4];
}

__host__
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
}
Loading

0 comments on commit 5a90db1

Please sign in to comment.