Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Magnum jumbo #17

Merged
merged 3 commits into from
May 15, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions doc/README-CUDA
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,17 @@ Performance issues:
If you have got Fermi or newer card change "-arch sm_10" to "-arch sm_20" in the NVCC_FLAGS (Makefile).
Default THREADS and BLOCKS settings might not be optimal.
To get better performance you can experiment with THREADS and BLOCKS macros defined for each format in cuda*.h file.
For MSCash2:
For MSCash2[1]:
CARD NAME BLOCKS THREADS SM RESULT
GTX460 14 128 20 14194 c/s
For WPAPSK:
For WPAPSK[1]:
CARD NAME BLOCKS THREADS SM RESULT
GTX460 14 256 20 15058 c/s
For XSHA512[2]:
CARD NAME BLOCKS THREADS SM RESULT
GTX570 1600 256 ?? 67385K c/s


You can contact me at lukas[dot]odzioba[at]gmail[dot]com or john-dev mailing list
You can contact us at
[1] lukas[dot]odzioba[at]gmail[dot]com
[2] qqlddg[at]gmail[dot]com
or john-dev mailing list
10 changes: 3 additions & 7 deletions src/cuda_xsha512.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,10 @@
#define uint32_t unsigned int
#define uint64_t unsigned long long int

#define BLOCKS 32
#define THREADS 128
#define BLOCKS 1024
#define THREADS 512
#define KEYS_PER_CRYPT (BLOCKS*THREADS)
#define ITERATIONS 8
#define ITERATIONS 1
#define MIN_KEYS_PER_CRYPT (KEYS_PER_CRYPT)
#define MAX_KEYS_PER_CRYPT (ITERATIONS*KEYS_PER_CRYPT)

Expand All @@ -27,11 +27,7 @@
#define FULL_BINARY_SIZE 64
#endif

#if 1
#define PLAINTEXT_LENGTH 107
#else
#define PLAINTEXT_LENGTH 12
#endif
#define CIPHERTEXT_LENGTH 136

extern uint8_t xsha512_key_changed;
Expand Down
75 changes: 30 additions & 45 deletions src/opencl/xsha512_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
* Redistribution and use in source and binary forms, with or without modification, are permitted.
*/

#ifdef cl_khr_byte_addressable_store
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
#endif

#define uint8_t unsigned char
#define uint32_t unsigned int
#define uint64_t unsigned long
Expand Down Expand Up @@ -47,24 +51,19 @@


typedef struct { // notice memory align problem
uint8_t buffer[128]; //1024 bits
uint32_t buflen;
uint64_t H[8];
uint32_t buffer[32]; //1024 bits
uint32_t buflen;
} xsha512_ctx;


typedef struct {
uint8_t v[SALT_SIZE]; // 32 bits
} xsha512_salt;

typedef struct {
uint8_t length;
char v[PLAINTEXT_LENGTH+1];
} xsha512_key;



#define hash_addr(j,idx) (((j)*(MAX_KEYS_PER_CRYPT))+(idx))
/* Macros for reading/writing chars from int32's */
#define PUTCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << (((index) & 3) << 3))) + ((val) << (((index) & 3) << 3))


__constant uint64_t k[] = {
Expand Down Expand Up @@ -111,42 +110,28 @@ __constant uint64_t k[] = {
};

void xsha512(__global const char* password, uint8_t pass_len,
__global uint64_t* hash, uint32_t offset, __constant char* salt)
__global uint64_t* hash, uint32_t offset, __constant uint32_t* salt)
{
__private xsha512_ctx ctx;
//init
ctx.H[0] = 0x6a09e667f3bcc908UL;
ctx.H[1] = 0xbb67ae8584caa73bUL;
ctx.H[2] = 0x3c6ef372fe94f82bUL;
ctx.H[3] = 0xa54ff53a5f1d36f1UL;
ctx.H[4] = 0x510e527fade682d1UL;
ctx.H[5] = 0x9b05688c2b3e6c1fUL;
ctx.H[6] = 0x1f83d9abfb41bd6bUL;
ctx.H[7] = 0x5be0cd19137e2179UL;
ctx.buflen = 0;


uint32_t* b32 = ctx.buffer;
//set salt to buffer
for (uint32_t i = 0; i < SALT_SIZE; i++) {
ctx.buffer[i] = salt[i];
}
*b32 = *salt;

//set password to buffer
for (uint32_t i = 0; i < pass_len; i++) {
ctx.buffer[i+SALT_SIZE] = password[i];
PUTCHAR(b32,i+SALT_SIZE,password[i]);
}
ctx.buflen = pass_len+SALT_SIZE;

//append 1 to ctx buffer
uint32_t length = ctx.buflen;
uint8_t *buffer8 = &ctx.buffer[length];

*buffer8++ = 0x80;

while(++length % 4 != 0) {
*buffer8++ = 0;
PUTCHAR(b32, length, 0x80);
while((++length & 3) != 0) {
PUTCHAR(b32, length, 0);
}

uint32_t *buffer32 = (uint32_t*)buffer8;
uint32_t* buffer32 = b32+(length>>2);
for(uint32_t i = length; i < 128; i+=4) {// append 0 to 128
*buffer32++=0;
}
Expand All @@ -157,19 +142,20 @@ void xsha512(__global const char* password, uint8_t pass_len,

// sha512 main
int i;
uint64_t a = ctx.H[0];
uint64_t b = ctx.H[1];
uint64_t c = ctx.H[2];
uint64_t d = ctx.H[3];
uint64_t e = ctx.H[4];
uint64_t f = ctx.H[5];
uint64_t g = ctx.H[6];
uint64_t h = ctx.H[7];


uint64_t a = 0x6a09e667f3bcc908UL;
uint64_t b = 0xbb67ae8584caa73bUL;
uint64_t c = 0x3c6ef372fe94f82bUL;
uint64_t d = 0xa54ff53a5f1d36f1UL;
uint64_t e = 0x510e527fade682d1UL;
uint64_t f = 0x9b05688c2b3e6c1fUL;
uint64_t g = 0x1f83d9abfb41bd6bUL;
uint64_t h = 0x5be0cd19137e2179UL;

__private uint64_t w[16];

uint64_t *data = (uint64_t *) ctx.buffer;

#pragma unroll 16
for (i = 0; i < 16; i++)
w[i] = SWAP64(data[i]);
Expand Down Expand Up @@ -206,28 +192,27 @@ void xsha512(__global const char* password, uint8_t pass_len,
b = a;
a = t1 + t2;
}

hash[offset] = SWAP64(a);
}

__kernel void kernel_xsha512(
__global const xsha512_key *password,
__global uint64_t *hash,
__constant char *salt)
__constant uint32_t *salt)
{

uint32_t idx = get_global_id(0);
for(uint32_t it = 0; it < ITERATIONS; ++it) {
uint32_t offset = idx+it*KEYS_PER_CRYPT;
xsha512((__global const char*)password[offset].v, password[offset].length,
xsha512(password[offset].v, password[offset].length,
hash, offset, salt);
}
}

__kernel void kernel_cmp(
__constant uint64_t* binary,
__global uint64_t *hash,
__global uint8_t* result)
__global uint32_t* result)
{
uint32_t idx = get_global_id(0);
if(idx == 0)
Expand Down
100 changes: 74 additions & 26 deletions src/opencl_xsha512_fmt.c
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,9 @@
#define CIPHERTEXT_LENGTH 136

typedef struct { // notice memory align problem
uint8_t buffer[128]; //1024bits
uint32_t buflen;
uint64_t H[8];
uint32_t buffer[32]; //1024 bits
uint32_t buflen;
} xsha512_ctx;


Expand Down Expand Up @@ -143,6 +143,73 @@ static char *get_key(int index)
return gkey[index].v;
}

static void find_best_workgroup()
{
cl_event myEvent;
cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
cl_ulong sumStartTime, sumEndTime;
size_t my_work_group = 1;
cl_int ret_code;
int i;
size_t max_group_size;
size_t work_size = KEYS_PER_CRYPT;
HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[gpu_id],
CL_KERNEL_WORK_GROUP_SIZE,sizeof (max_group_size), &max_group_size,
NULL), "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");

cl_command_queue queue_prof =
clCreateCommandQueue(context[gpu_id], devices[gpu_id],
CL_QUEUE_PROFILING_ENABLE,
&ret_code);
HANDLE_CLERROR(ret_code, "Error while creating command queue");

/// Set keys
char *pass = "password";
for (i = 0; i < MAX_KEYS_PER_CRYPT; i++) {
set_key(pass, i);
}

///Set salt
memcpy(gsalt.v, "abcd", SALT_SIZE);

///Copy data to GPU
HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_in, CL_FALSE, 0,
insize, gkey, 0, NULL, NULL), "Copy data to gpu");
HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_salt, CL_FALSE,
0, SALT_SIZE, &gsalt, 0, NULL, NULL), "Copy memsalt");

my_work_group = 1;
if (get_device_type(gpu_id) == CL_DEVICE_TYPE_GPU)
my_work_group = 32;

///Find best local work size
for (; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) {
sumStartTime = 0;
sumEndTime = 0;
for (i = 0; i < 10; ++i) {
HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel,
1, NULL, &work_size, &my_work_group, 0, NULL,
&myEvent), "Run kernel");
HANDLE_CLERROR(clFinish(queue_prof), "clFinish error");

clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &startTime, NULL);
clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL);
sumStartTime += startTime;
sumEndTime += endTime;
}
if ((sumEndTime - sumStartTime) < kernelExecTimeNs) {
kernelExecTimeNs = sumEndTime - sumStartTime;
local_work_size = my_work_group;
}
//printf("%d time=%lld\n",(int) my_work_group, endTime-startTime);
}
printf("Optimal Group work Size = %d\n", (int) local_work_size);
clReleaseCommandQueue(queue_prof);
}


static void init(struct fmt_main *pFmt)
{
opencl_init("$JOHN/xsha512_kernel.cl", gpu_id, platform_id);
Expand All @@ -165,7 +232,7 @@ static void init(struct fmt_main *pFmt)
&ret_code);
HANDLE_CLERROR(ret_code,"Error while alocating memory for binary");
mem_cmp =
clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, sizeof(uint8_t), NULL,
clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, sizeof(uint32_t), NULL,
&ret_code);
HANDLE_CLERROR(ret_code,"Error while alocating memory for cmp_all result");

Expand All @@ -183,11 +250,8 @@ static void init(struct fmt_main *pFmt)
clSetKernelArg(cmp_kernel, 1, sizeof(mem_out), &mem_out);
clSetKernelArg(cmp_kernel, 2, sizeof(mem_cmp), &mem_cmp);

//find_best_workgroup();
HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[gpu_id], CL_KERNEL_WORK_GROUP_SIZE,
sizeof (local_work_size), &local_work_size, NULL),
"Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");
printf("Local work size = %d\n", (int)local_work_size);
find_best_workgroup();

printf("Global work size = %lld\n",(long long)global_work_size);
atexit(release_all);

Expand Down Expand Up @@ -377,10 +441,6 @@ static int salt_hash(void *salt)

static void set_salt(void *salt)
{
HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE, 0,
outsize, ghash, 0, NULL, NULL), "Copy data back");
HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");

memcpy(gsalt.v, (uint8_t*)salt, SALT_SIZE);
}

Expand All @@ -401,8 +461,6 @@ static void crypt_all(int count)
HANDLE_CLERROR(clEnqueueNDRangeKernel
(queue[gpu_id], crypt_kernel, 1, NULL, &worksize, &localworksize,
0, NULL, NULL), "Set ND range");
// HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE, 0,
// outsize, ghash, 0, NULL, NULL), "Copy data back");

///Await completion of all the above
HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
Expand All @@ -424,25 +482,15 @@ static int cmp_all(void *binary, int count)
(queue[gpu_id], cmp_kernel, 1, NULL, &worksize, &localworksize,
0, NULL, NULL), "Set ND range");

uint8_t result;
uint32_t result;
/// Copy result out
HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_cmp, CL_FALSE, 0,
sizeof(uint8_t), &result, 0, NULL, NULL), "Copy data back");
sizeof(uint32_t), &result, 0, NULL, NULL), "Copy data back");

///Await completion of all the above
HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
return result;

/* uint64_t b0 = *((uint64_t *)binary+3);
uint64_t* h = (uint64_t*)ghash;
int i;

for (i = 0; i < count; i++) {
if (b0 == h[i])
return 1;
}
return 0;
*/
}

static int cmp_one(void *binary, int index)
Expand Down