openwall · magnumripper · May 15, 2012 · Apr 30, 2012 · May 12, 2012 · May 15, 2012
diff --git a/doc/README-CUDA b/doc/README-CUDA
@@ -10,12 +10,17 @@ Performance issues:
   If you have got Fermi or newer card change "-arch sm_10" to "-arch sm_20" in the NVCC_FLAGS (Makefile).
   Default THREADS and BLOCKS settings might not be optimal.
   To get better performance you can experiment with THREADS and BLOCKS macros defined for each format in cuda*.h file.
-  For MSCash2:
+  For MSCash2[1]:
     CARD NAME	BLOCKS	THREADS	SM	RESULT
     GTX460	14	128	20	14194 c/s
-  For WPAPSK:
+  For WPAPSK[1]:
     CARD NAME	BLOCKS	THREADS	SM	RESULT
     GTX460	14	256	20	15058 c/s
+  For XSHA512[2]:
+    CARD NAME	BLOCKS	THREADS	SM	RESULT
+    GTX570	1600	256	??	67385K c/s
 
-
-You can contact me at lukas[dot]odzioba[at]gmail[dot]com or john-dev mailing list
+You can contact us at 
+[1] lukas[dot]odzioba[at]gmail[dot]com 
+[2] qqlddg[at]gmail[dot]com
+or john-dev mailing list
diff --git a/src/cuda_xsha512.h b/src/cuda_xsha512.h
@@ -12,10 +12,10 @@
 #define uint32_t unsigned int
 #define uint64_t unsigned long long int
 
-#define BLOCKS 32
-#define THREADS 128
+#define BLOCKS 1024
+#define THREADS 512
 #define KEYS_PER_CRYPT (BLOCKS*THREADS)
-#define ITERATIONS 8
+#define ITERATIONS 1
 #define MIN_KEYS_PER_CRYPT	(KEYS_PER_CRYPT)
 #define MAX_KEYS_PER_CRYPT	(ITERATIONS*KEYS_PER_CRYPT)
 
@@ -27,11 +27,7 @@
 #define FULL_BINARY_SIZE 64
 #endif
 
-#if 1
-#define PLAINTEXT_LENGTH		107
-#else
 #define PLAINTEXT_LENGTH		12
-#endif
 #define CIPHERTEXT_LENGTH		136
 
 extern uint8_t xsha512_key_changed;

diff --git a/src/opencl/xsha512_kernel.cl b/src/opencl/xsha512_kernel.cl
@@ -4,6 +4,10 @@
 * Redistribution and use in source and binary forms, with or without modification, are permitted.
 */
 
+#ifdef cl_khr_byte_addressable_store
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : disable
+#endif
+
 #define uint8_t  unsigned char
 #define uint32_t unsigned int
 #define uint64_t unsigned long
@@ -47,24 +51,19 @@
 
 
 typedef struct { // notice memory align problem
-	uint8_t buffer[128];	//1024 bits
-	uint32_t buflen;
 	uint64_t H[8];
+	uint32_t buffer[32];	//1024 bits
+	uint32_t buflen;
 } xsha512_ctx;
 
-
-typedef struct {
-    uint8_t v[SALT_SIZE]; // 32 bits
-} xsha512_salt;
-
 typedef struct {
     uint8_t length;
     char v[PLAINTEXT_LENGTH+1];
 } xsha512_key;
 
 
-
-#define hash_addr(j,idx) (((j)*(MAX_KEYS_PER_CRYPT))+(idx))
+/* Macros for reading/writing chars from int32's */
+#define PUTCHAR(buf, index, val) (buf)[(index)>>2] = ((buf)[(index)>>2] & ~(0xffU << (((index) & 3) << 3))) + ((val) << (((index) & 3) << 3))
 
 
 __constant uint64_t k[] = {
@@ -111,42 +110,28 @@ __constant uint64_t k[] = {
 };
 
 void xsha512(__global const char* password, uint8_t pass_len, 
-	__global uint64_t* hash, uint32_t offset, __constant char* salt)
+	__global uint64_t* hash, uint32_t offset, __constant uint32_t* salt)
 {
     __private xsha512_ctx ctx;
-	//init
-    ctx.H[0] = 0x6a09e667f3bcc908UL;
-	ctx.H[1] = 0xbb67ae8584caa73bUL;
-	ctx.H[2] = 0x3c6ef372fe94f82bUL;
-	ctx.H[3] = 0xa54ff53a5f1d36f1UL;
-	ctx.H[4] = 0x510e527fade682d1UL;
-	ctx.H[5] = 0x9b05688c2b3e6c1fUL;
-	ctx.H[6] = 0x1f83d9abfb41bd6bUL;
-	ctx.H[7] = 0x5be0cd19137e2179UL;
-	ctx.buflen = 0;
-
+
+	uint32_t* b32 = ctx.buffer;
 	//set salt to buffer
-	for (uint32_t i = 0; i < SALT_SIZE; i++) {
-		ctx.buffer[i] = salt[i];
-	}
+	*b32 = *salt;
 
 	//set password to buffer
     for (uint32_t i = 0; i < pass_len; i++) {
-		ctx.buffer[i+SALT_SIZE] = password[i];
+		PUTCHAR(b32,i+SALT_SIZE,password[i]);
 	}
     ctx.buflen = pass_len+SALT_SIZE;
 
 	//append 1 to ctx buffer
 	uint32_t length = ctx.buflen;
-	uint8_t *buffer8 = &ctx.buffer[length];
-
-	*buffer8++ = 0x80;
-
-	while(++length % 4 != 0)  {
-		*buffer8++ = 0;
+	PUTCHAR(b32, length, 0x80);
+	while((++length & 3) != 0)  {
+		PUTCHAR(b32, length, 0);
 	}
 
-	uint32_t *buffer32 = (uint32_t*)buffer8;
+	uint32_t* buffer32 = b32+(length>>2);
 	for(uint32_t i = length; i < 128; i+=4) {// append 0 to 128
 		*buffer32++=0;
 	}
@@ -157,19 +142,20 @@ void xsha512(__global const char* password, uint8_t pass_len,
 
 	// sha512 main
 	int i;
-	uint64_t a = ctx.H[0];
-	uint64_t b = ctx.H[1];
-	uint64_t c = ctx.H[2];
-	uint64_t d = ctx.H[3];
-	uint64_t e = ctx.H[4];
-	uint64_t f = ctx.H[5];
-	uint64_t g = ctx.H[6];
-	uint64_t h = ctx.H[7];
-
+
+	uint64_t a = 0x6a09e667f3bcc908UL;
+	uint64_t b = 0xbb67ae8584caa73bUL;
+	uint64_t c = 0x3c6ef372fe94f82bUL;
+	uint64_t d = 0xa54ff53a5f1d36f1UL;
+	uint64_t e = 0x510e527fade682d1UL;
+	uint64_t f = 0x9b05688c2b3e6c1fUL;
+	uint64_t g = 0x1f83d9abfb41bd6bUL;
+	uint64_t h = 0x5be0cd19137e2179UL;
 
 	__private uint64_t w[16];
 
 	uint64_t *data = (uint64_t *) ctx.buffer;
+
 	#pragma unroll 16
 	for (i = 0; i < 16; i++)
 		w[i] = SWAP64(data[i]);
@@ -206,28 +192,27 @@ void xsha512(__global const char* password, uint8_t pass_len,
 		b = a;
 		a = t1 + t2;
 	}
-
 	hash[offset] = SWAP64(a);
 }
 
 __kernel void kernel_xsha512(
 	__global const xsha512_key *password, 
 	__global uint64_t *hash,
-	__constant char *salt)
+	__constant uint32_t *salt)
 {
 
     uint32_t idx = get_global_id(0);
 	for(uint32_t it = 0; it < ITERATIONS; ++it) {		
 		uint32_t offset = idx+it*KEYS_PER_CRYPT;
-    	xsha512((__global const char*)password[offset].v, password[offset].length, 
+    	xsha512(password[offset].v, password[offset].length, 
 			hash, offset, salt);
 	}
 }
 
 __kernel void kernel_cmp(
 	__constant uint64_t* binary, 
 	__global uint64_t *hash,
-	__global uint8_t* result)
+	__global uint32_t* result)
 {
     uint32_t idx = get_global_id(0);
 	if(idx == 0)

diff --git a/src/opencl_xsha512_fmt.c b/src/opencl_xsha512_fmt.c
@@ -68,9 +68,9 @@
 #define CIPHERTEXT_LENGTH 136
 
 typedef struct { // notice memory align problem
-	uint8_t buffer[128];	//1024bits
-	uint32_t buflen;
 	uint64_t H[8];
+	uint32_t buffer[32];	//1024 bits
+	uint32_t buflen;
 } xsha512_ctx;
 
 
@@ -143,6 +143,73 @@ static char *get_key(int index)
 	return gkey[index].v;
 }
 
+static void find_best_workgroup()
+{
+	cl_event myEvent;
+	cl_ulong startTime, endTime, kernelExecTimeNs = CL_ULONG_MAX;
+	cl_ulong sumStartTime, sumEndTime;
+	size_t my_work_group = 1;
+	cl_int ret_code;
+	int i;
+	size_t max_group_size;
+	size_t work_size = KEYS_PER_CRYPT;
+    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[gpu_id], 
+		CL_KERNEL_WORK_GROUP_SIZE,sizeof (max_group_size), &max_group_size, 
+		NULL), "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");
+
+	cl_command_queue queue_prof =
+	    clCreateCommandQueue(context[gpu_id], devices[gpu_id],
+	    CL_QUEUE_PROFILING_ENABLE,
+	    &ret_code);
+	HANDLE_CLERROR(ret_code, "Error while creating command queue");
+
+	/// Set keys
+	char *pass = "password";
+	for (i = 0; i < MAX_KEYS_PER_CRYPT; i++) {
+		set_key(pass, i);
+	}
+
+	///Set salt
+	memcpy(gsalt.v, "abcd", SALT_SIZE);
+
+	///Copy data to GPU
+	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_in, CL_FALSE, 0,
+		insize, gkey, 0, NULL, NULL), "Copy data to gpu");
+	HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, mem_salt, CL_FALSE,
+			0, SALT_SIZE, &gsalt, 0, NULL, NULL), "Copy memsalt");
+
+	my_work_group = 1;
+	if (get_device_type(gpu_id) == CL_DEVICE_TYPE_GPU) 
+		my_work_group = 32;
+
+	///Find best local work size
+	for (; (int) my_work_group <= (int) max_group_size; my_work_group *= 2) {
+		sumStartTime = 0;
+		sumEndTime = 0;
+		for (i = 0; i < 10; ++i) {
+			HANDLE_CLERROR(clEnqueueNDRangeKernel(queue_prof, crypt_kernel,
+				1, NULL, &work_size, &my_work_group, 0, NULL,
+				&myEvent), "Run kernel");
+			HANDLE_CLERROR(clFinish(queue_prof), "clFinish error");
+
+			clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
+			    sizeof(cl_ulong), &startTime, NULL);
+			clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
+			    sizeof(cl_ulong), &endTime, NULL);
+			sumStartTime += startTime;
+			sumEndTime += endTime;
+		}
+		if ((sumEndTime - sumStartTime) < kernelExecTimeNs) {
+			kernelExecTimeNs = sumEndTime - sumStartTime;
+			local_work_size = my_work_group;
+		}
+		//printf("%d time=%lld\n",(int) my_work_group, endTime-startTime);
+	}
+	printf("Optimal Group work Size = %d\n", (int) local_work_size);
+	clReleaseCommandQueue(queue_prof);
+}
+
+
 static void init(struct fmt_main *pFmt)
 {
 	opencl_init("$JOHN/xsha512_kernel.cl", gpu_id, platform_id);
@@ -165,7 +232,7 @@ static void init(struct fmt_main *pFmt)
 		&ret_code);
 	HANDLE_CLERROR(ret_code,"Error while alocating memory for binary");
 	mem_cmp =
-		clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, sizeof(uint8_t), NULL,
+		clCreateBuffer(context[gpu_id], CL_MEM_WRITE_ONLY, sizeof(uint32_t), NULL,
 		&ret_code);
 	HANDLE_CLERROR(ret_code,"Error while alocating memory for cmp_all result");
 
@@ -183,11 +250,8 @@ static void init(struct fmt_main *pFmt)
 	clSetKernelArg(cmp_kernel, 1, sizeof(mem_out), &mem_out);
 	clSetKernelArg(cmp_kernel, 2, sizeof(mem_cmp), &mem_cmp);
 
-	//find_best_workgroup();
-    HANDLE_CLERROR(clGetKernelWorkGroupInfo(crypt_kernel, devices[gpu_id], CL_KERNEL_WORK_GROUP_SIZE,
-               sizeof (local_work_size), &local_work_size, NULL),
-               "Error querying CL_DEVICE_MAX_WORK_GROUP_SIZE");
-	printf("Local work size = %d\n", (int)local_work_size);
+	find_best_workgroup();
+
 	printf("Global work size = %lld\n",(long long)global_work_size);
 	atexit(release_all);
 
@@ -377,10 +441,6 @@ static int salt_hash(void *salt)
 
 static void set_salt(void *salt)
 {
-	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE, 0,
-		outsize, ghash, 0, NULL, NULL), "Copy data back");
-	HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
-
 	memcpy(gsalt.v, (uint8_t*)salt, SALT_SIZE);
 }
 
@@ -401,8 +461,6 @@ static void crypt_all(int count)
 	HANDLE_CLERROR(clEnqueueNDRangeKernel
 	    (queue[gpu_id], crypt_kernel, 1, NULL, &worksize, &localworksize,
 		0, NULL, NULL), "Set ND range");
-//	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_out, CL_FALSE, 0,
-//		outsize, ghash, 0, NULL, NULL), "Copy data back");
 
 	///Await completion of all the above
 	HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
@@ -424,25 +482,15 @@ static int cmp_all(void *binary, int count)
 	    (queue[gpu_id], cmp_kernel, 1, NULL, &worksize, &localworksize,
 		0, NULL, NULL), "Set ND range");
 
-	uint8_t result;
+	uint32_t result;
 	/// Copy result out
 	HANDLE_CLERROR(clEnqueueReadBuffer(queue[gpu_id], mem_cmp, CL_FALSE, 0,
-		sizeof(uint8_t), &result, 0, NULL, NULL), "Copy data back");
+		sizeof(uint32_t), &result, 0, NULL, NULL), "Copy data back");
 
 	///Await completion of all the above
 	HANDLE_CLERROR(clFinish(queue[gpu_id]), "clFinish error");
 	return result;
 
-/*	uint64_t b0 = *((uint64_t *)binary+3);
-	uint64_t* h = (uint64_t*)ghash;
-	int i;
-
-	for (i = 0; i < count; i++) {
-		if (b0 == h[i])
-			return 1;
-	}
-	return 0;
-*/
 }
 
 static int cmp_one(void *binary, int index)