Permalink
Browse files

OpenCL formats: more complete release_clobj() and other tweaks.

  • Loading branch information...
1 parent 054cab5 commit 6a3aa3c7ffa148e9a24d52039609bcb5fb444b84 @magnumripper committed Nov 14, 2012
@@ -372,8 +372,8 @@ inline void sha1_init(MAYBE_VECTOR_UINT *output) {
}
__kernel void GenerateSHA1pwhash(
- __global uint *unicode_pw,
- __global uint *pw_len,
+ __global const uint *unicode_pw,
+ __global const uint *pw_len,
__constant uint *salt,
__global uint *pwhash)
{
@@ -374,8 +374,8 @@ inline void sha1_init(MAYBE_VECTOR_UINT *output) {
}
__kernel void GenerateSHA1pwhash(
- __global uint *unicode_pw,
- __global uint *pw_len,
+ __global const uint *unicode_pw,
+ __global const uint *pw_len,
__constant uint *salt,
__global uint *pwhash)
{
@@ -197,8 +197,8 @@ inline void sha512_single(MAYBE_VECTOR_ULONG *w, MAYBE_VECTOR_ULONG *output) {
}
__kernel void GenerateSHA512pwhash(
- __global ulong *unicode_pw,
- __global uint *pw_len,
+ __global const ulong *unicode_pw,
+ __global const uint *pw_len,
__constant ulong *salt,
__global ulong *pwhash)
{
View
@@ -80,20 +80,15 @@ static int VF = 1;
static cl_mem cl_saved_key, cl_challenge, cl_nthash, cl_result;
static cl_kernel ntlmv2_nthash;
-/* This is much faster but needs OpenCL 1.2 */
-#ifndef CL_MAP_WRITE_INVALIDATE_REGION
-#define CL_MAP_WRITE_INVALIDATE_REGION CL_MAP_WRITE
-#endif
-
static void create_clobj(int gws, struct fmt_main *self)
{
global_work_size = gws;
gws *= VF;
self->params.min_keys_per_crypt = self->params.max_keys_per_crypt = gws;
- cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, 64 * gws, NULL , &ret_code);
+ cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, 64 * gws, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
- saved_key = clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_key, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, 64 * gws, 0, NULL, NULL, &ret_code);
+ saved_key = clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_key, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 64 * gws, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_key");
memset(saved_key, 0, 64 * gws);
@@ -105,7 +100,7 @@ static void create_clobj(int gws, struct fmt_main *self)
cl_challenge = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, SALT_SIZE_MAX, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
- challenge = clEnqueueMapBuffer(queue[ocl_gpu_id], cl_challenge, CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, SALT_SIZE_MAX, 0, NULL, NULL, &ret_code);
+ challenge = clEnqueueMapBuffer(queue[ocl_gpu_id], cl_challenge, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, SALT_SIZE_MAX, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory challenge");
memset(challenge, 0, SALT_SIZE_MAX);
@@ -125,10 +120,13 @@ static void release_clobj(void)
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_challenge, challenge, 0, NULL, NULL), "Error Unmapping challenge");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_result, output, 0, NULL, NULL), "Error Unmapping output");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_saved_key, saved_key, 0, NULL, NULL), "Error Unmapping saved_key");
+
HANDLE_CLERROR(clReleaseMemObject(cl_challenge), "Release state buffer");
HANDLE_CLERROR(clReleaseMemObject(cl_result), "Release state buffer");
HANDLE_CLERROR(clReleaseMemObject(cl_saved_key), "Release state buffer");
HANDLE_CLERROR(clReleaseMemObject(cl_nthash), "Release state buffer");
+
+ challenge = NULL; output = saved_key = NULL;
}
static void clear_keys(void)
@@ -275,6 +273,8 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
if (do_benchmark)
fprintf(stderr, "exceeds 200 ms\n");
+ clReleaseCommandQueue(queue_prof);
+ release_clobj();
return 0;
}
@@ -291,6 +291,8 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
if (do_benchmark)
fprintf(stderr, "- exceeds 200 ms\n");
+ clReleaseCommandQueue(queue_prof);
+ release_clobj();
return 0;
}
@@ -307,7 +309,7 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
fprintf(stderr, "\n");
HANDLE_CLERROR(clGetEventProfilingInfo(Event[0],
- CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime,
+ CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime,
NULL), "Failed to get profiling info");
HANDLE_CLERROR(clGetEventProfilingInfo(Event[3],
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
@@ -327,9 +329,7 @@ static void find_best_gws(int do_benchmark, struct fmt_main *self)
const int md5perkey = 11;
unsigned long long int MaxRunTime = 1000000000ULL;
- /* Do not allocate more than 1/4 of total GPU memory */
- max_gws = MIN(get_global_memory_size(ocl_gpu_id) / 4 / 96,
- get_max_mem_alloc_size(ocl_gpu_id) / 64);
+ max_gws = get_max_mem_alloc_size(ocl_gpu_id) / 64;
if (do_benchmark) {
fprintf(stderr, "Calculating best keys per crypt (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, MaxRunTime / 1000000000UL);
@@ -97,20 +97,20 @@ static void create_clobj(int gws, struct fmt_main *self)
global_work_size = gws;
gws *= VF;
self->params.min_keys_per_crypt = self->params.max_keys_per_crypt = gws;
- cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, UNICODE_LENGTH * gws, NULL , &ret_code);
+ cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, UNICODE_LENGTH * gws, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_key = (char*)clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_key, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, UNICODE_LENGTH * gws, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_key");
memset(saved_key, 0, UNICODE_LENGTH * gws);
- cl_saved_len = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int) * gws, NULL, &ret_code);
+ cl_saved_len = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int) * gws, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_len = (int*)clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_len, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int) * gws, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_len");
for (i = 0; i < gws; i++)
saved_len[i] = bench_len;
- cl_salt = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, SALT_LENGTH, NULL, &ret_code);
+ cl_salt = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, SALT_LENGTH, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_salt = (char*) clEnqueueMapBuffer(queue[ocl_gpu_id], cl_salt, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, SALT_LENGTH, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_salt");
@@ -144,6 +144,14 @@ static void release_clobj(void)
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_saved_key, saved_key, 0, NULL, NULL), "Error Unmapping saved_key");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_saved_len, saved_len, 0, NULL, NULL), "Error Unmapping saved_len");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_salt, saved_salt, 0, NULL, NULL), "Error Unmapping saved_salt");
+
+#ifndef __APPLE__ /* Triggers a bug in OSX 10.8.2 w/ MacBook Air and MacBook Pro Update 2.0 for GT 650 */
+ HANDLE_CLERROR(clReleaseMemObject(cl_key), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_saved_key), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_saved_len), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_salt), "Release GPU buffer");
+#endif
+
key = NULL; saved_key = NULL; saved_len = NULL; saved_salt = NULL;
MEM_FREE(cracked);
}
@@ -282,6 +290,8 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
if (do_benchmark)
fprintf(stderr, "- exceeds 200 ms\n");
+ clReleaseCommandQueue(queue_prof);
+ release_clobj();
return 0;
}
@@ -311,17 +321,19 @@ static void find_best_gws(int do_benchmark, struct fmt_main *self)
{
int num;
cl_ulong run_time, min_time = CL_ULONG_MAX;
- unsigned int SHAspeed, bestSHAspeed = 0;
+ unsigned int SHAspeed, bestSHAspeed = 0, max_gws;
int optimal_gws = local_work_size;
const int sha1perkey = 50004;
unsigned long long int MaxRunTime = 5000000000ULL;
+ max_gws = get_max_mem_alloc_size(ocl_gpu_id) / UNICODE_LENGTH;
+
if (do_benchmark) {
fprintf(stderr, "Calculating best keys per crypt (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, MaxRunTime / 1000000000UL);
fprintf(stderr, "Raw GPU speed figures including buffer transfers:\n");
}
- for (num = local_work_size; num; num *= 2) {
+ for (num = local_work_size; max_gws; num *= 2) {
if (!do_benchmark)
advance_cursor();
if (!(run_time = gws_test(num, do_benchmark, self)))
@@ -79,7 +79,7 @@ static char *saved_key; /* Password encoded in UCS-2 */
static int *saved_len; /* UCS-2 password length, in octets */
static char *saved_salt;
static unsigned char *key; /* Output key from kernel */
-static int new_keys, *spincount;
+static int new_keys, spincount;
static cl_mem cl_saved_key, cl_saved_len, cl_salt, cl_pwhash, cl_key, cl_spincount;
static cl_kernel GenerateSHA1pwhash, Generate2010key;
@@ -92,20 +92,20 @@ static void create_clobj(int gws, struct fmt_main *self)
global_work_size = gws;
gws *= VF;
self->params.min_keys_per_crypt = self->params.max_keys_per_crypt = gws;
- cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, UNICODE_LENGTH * gws, NULL , &ret_code);
+ cl_saved_key = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, UNICODE_LENGTH * gws, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_key = (char*)clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_key, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, UNICODE_LENGTH * gws, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_key");
memset(saved_key, 0, UNICODE_LENGTH * gws);
- cl_saved_len = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int) * gws, NULL, &ret_code);
+ cl_saved_len = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int) * gws, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_len = (int*)clEnqueueMapBuffer(queue[ocl_gpu_id], cl_saved_len, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int) * gws, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_len");
for (i = 0; i < gws; i++)
saved_len[i] = bench_len;
- cl_salt = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, SALT_LENGTH, NULL, &ret_code);
+ cl_salt = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, SALT_LENGTH, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
saved_salt = (char*) clEnqueueMapBuffer(queue[ocl_gpu_id], cl_salt, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, SALT_LENGTH, 0, NULL, NULL, &ret_code);
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory saved_salt");
@@ -120,11 +120,8 @@ static void create_clobj(int gws, struct fmt_main *self)
HANDLE_CLERROR(ret_code, "Error mapping page-locked memory verifier keys");
memset(key, 0, 32 * gws);
- cl_spincount = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_int), NULL, &ret_code);
- HANDLE_CLERROR(ret_code, "Error creating page-locked memory");
- spincount = (int*) clEnqueueMapBuffer(queue[ocl_gpu_id], cl_spincount, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(cl_int), 0, NULL, NULL, &ret_code);
- HANDLE_CLERROR(ret_code, "Error mapping page-locked memory spincount");
- memset(spincount, 0, sizeof(cl_int));
+ cl_spincount = clCreateBuffer(context[ocl_gpu_id], CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_int), &spincount, &ret_code);
+ HANDLE_CLERROR(ret_code, "Error mapping spincount");
HANDLE_CLERROR(clSetKernelArg(GenerateSHA1pwhash, 0, sizeof(cl_mem), (void*)&cl_saved_key), "Error setting argument 0");
HANDLE_CLERROR(clSetKernelArg(GenerateSHA1pwhash, 1, sizeof(cl_mem), (void*)&cl_saved_len), "Error setting argument 1");
@@ -142,12 +139,18 @@ static void create_clobj(int gws, struct fmt_main *self)
static void release_clobj(void)
{
- HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_spincount, spincount, 0, NULL, NULL), "Error Unmapping spincount");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_key, key, 0, NULL, NULL), "Error Unmapping key");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_saved_key, saved_key, 0, NULL, NULL), "Error Unmapping saved_key");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_saved_len, saved_len, 0, NULL, NULL), "Error Unmapping saved_len");
HANDLE_CLERROR(clEnqueueUnmapMemObject(queue[ocl_gpu_id], cl_salt, saved_salt, 0, NULL, NULL), "Error Unmapping saved_salt");
- spincount = NULL; key = NULL; saved_key = NULL; saved_len = NULL; saved_salt = NULL;
+
+ HANDLE_CLERROR(clReleaseMemObject(cl_spincount), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_key), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_saved_key), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_saved_len), "Release GPU buffer");
+ HANDLE_CLERROR(clReleaseMemObject(cl_salt), "Release GPU buffer");
+
+ key = NULL; saved_key = NULL; saved_len = NULL; saved_salt = NULL;
MEM_FREE(cracked);
}
@@ -212,9 +215,9 @@ static void set_salt(void *salt)
{
cur_salt = (struct custom_salt *)salt;
memcpy(saved_salt, cur_salt->osalt, SALT_LENGTH);
- *spincount = cur_salt->spinCount;
+ spincount = cur_salt->spinCount;
HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], cl_salt, CL_FALSE, 0, SALT_LENGTH, saved_salt, 0, NULL, NULL), "failed in clEnqueueWriteBuffer saved_salt");
- HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], cl_spincount, CL_FALSE, 0, 4, spincount, 0, NULL, NULL), "failed in clEnqueueWriteBuffer spincount");
+ HANDLE_CLERROR(clEnqueueWriteBuffer(queue[ocl_gpu_id], cl_spincount, CL_FALSE, 0, 4, &spincount, 0, NULL, NULL), "failed in clEnqueueWriteBuffer spincount");
}
static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
@@ -243,7 +246,7 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
return 0;
}
- for (i = 0; i < *spincount / HASH_LOOPS - 1; i++) {
+ for (i = 0; i < spincount / HASH_LOOPS - 1; i++) {
ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
if (ret_code != CL_SUCCESS) {
fprintf(stderr, "Error: %s\n", get_error_name(ret_code));
@@ -281,12 +284,14 @@ static cl_ulong gws_test(int gws, int do_benchmark, struct fmt_main *self)
CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime,
NULL), "Failed to get profiling info");
if (do_benchmark)
- fprintf(stderr, "%.2f ms x %u = %.2f s\t", (float)((endTime - startTime)/1000000.), *spincount/HASH_LOOPS, (float)(*spincount/HASH_LOOPS) * (endTime - startTime) / 1000000000.);
+ fprintf(stderr, "%.2f ms x %u = %.2f s\t", (float)((endTime - startTime)/1000000.), spincount/HASH_LOOPS, (float)(spincount/HASH_LOOPS) * (endTime - startTime) / 1000000000.);
/* 200 ms duration limit for GCN to avoid ASIC hangs */
if (amd_gcn(device_info[ocl_gpu_id]) && endTime - startTime > 200000000) {
if (do_benchmark)
fprintf(stderr, "- exceeds 200 ms\n");
+ clReleaseCommandQueue(queue_prof);
+ release_clobj();
return 0;
}
@@ -316,17 +321,19 @@ static void find_best_gws(int do_benchmark, struct fmt_main *self)
{
int num;
cl_ulong run_time, min_time = CL_ULONG_MAX;
- unsigned int SHAspeed, bestSHAspeed = 0;
+ unsigned int SHAspeed, bestSHAspeed = 0, max_gws;
int optimal_gws = local_work_size;
int sha1perkey;
unsigned long long int MaxRunTime = 5000000000ULL;
+ max_gws = get_max_mem_alloc_size(ocl_gpu_id) / UNICODE_LENGTH;
+
if (do_benchmark) {
fprintf(stderr, "Calculating best keys per crypt (GWS) for LWS=%zd and max. %llu s duration.\n\n", local_work_size, MaxRunTime / 1000000000UL);
fprintf(stderr, "Raw GPU speed figures including buffer transfers:\n");
}
- for (num = local_work_size; num; num *= 2) {
+ for (num = local_work_size; max_gws; num *= 2) {
if (!do_benchmark)
advance_cursor();
if (!(run_time = gws_test(num, do_benchmark, self)))
@@ -496,7 +503,7 @@ static void crypt_all(int count)
HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], GenerateSHA1pwhash, 1, NULL, &scalar_gws, &local_work_size, 0, NULL, firstEvent), "failed in clEnqueueNDRangeKernel");
- for (index = 0; index < *spincount / HASH_LOOPS; index++)
+ for (index = 0; index < spincount / HASH_LOOPS; index++)
HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], crypt_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL), "failed in clEnqueueNDRangeKernel");
HANDLE_CLERROR(clEnqueueNDRangeKernel(queue[ocl_gpu_id], Generate2010key, 1, NULL, &global_work_size, &local_work_size, 0, NULL, lastEvent), "failed in clEnqueueNDRangeKernel");
Oops, something went wrong.

9 comments on commit 6a3aa3c

Hey Magnum,

I'm sorry but after this commit my --format=office2010 is again not working...
I also re-installed AMD Catalyst 12.8 and rebootet, to make sure nothing else can be the reason for failing now.

make[1]: Verlasse Verzeichnis '/home/marc/JohnTheRipper/src'
cp opencl/_.cl ../run/
cp opencl__.h ../run/
marc@nerberd:/JohnTheRipper/src$ cd ../run/
marc@nerberd:
/JohnTheRipper/run$ ./john --list=opencl-devices
Platform #0 name: AMD Accelerated Parallel Processing
Platform version: OpenCL 1.2 AMD-APP (938.2)
Device #0 name: Juniper
Device vendor: Advanced Micro Devices, Inc.
Device type: GPU (LE)
Device version: OpenCL 1.2 AMD-APP (938.2)
Driver version: CAL 1.4.1741
Global Memory: 512.5 MB
Global Memory Cache: 0.0 B
Local Memory: 32.0 KB (Local)
Max memory alloc. size: 128.1 MB
Max clock (MHz) : 675
Max Work Group Size: 256
Parallel compute cores: 10
Stream processors: 800 (10 x 80)

    Device #1 name:         Intel(R) Core(TM) i7-2670QM CPU @ 2.20GHz
    Device vendor:          GenuineIntel
    Device type:            CPU (LE)
    Device version:         OpenCL 1.2 AMD-APP (938.2)
    Driver version:         2.0 (sse2,avx)
    Global Memory:          15.0 GB
    Global Memory Cache:    32.0 KB
    Local Memory:           32.0 KB (Global)
    Max memory alloc. size: 3.0 GB
    Max clock (MHz) :       800
    Max Work Group Size:    1024
    Parallel compute cores: 8

marc@nerberd:/JohnTheRipper/run$ ./john --test --format=office2010-opencl
OpenCL platform 0: AMD Accelerated Parallel Processing, 2 device(s).
Using device 0: Juniper
Local worksize (LWS) 64, Global worksize (GWS) 4096
Benchmarking: Office 2010 SHA-1 AES (100,000 iterations) [OpenCL 4x]... Speicherzugriffsfehler (Speicherabzug geschrieben)
marc@nerberd:
/JohnTheRipper/run$

Owner

magnumripper replied Nov 15, 2012

That is really odd. If anything, this commit should make things more stable. Does the office-2007 format work fine? It is nearly identical.

Owner

magnumripper replied Nov 15, 2012

OK, I can reproduce it if forcing vectorized mode (like the Juniper uses) on the Tahiti. Try forcing scalar mode as I described on the mailing list. I bet that will work.

i tested the other modes, no Office-Opencl Mode is working.
http://pastebin.com/4eGPD81b

Once i created my own branch, using the version before this commit, it is working again.

PS: If you prefer the mailinglist, i can also write there ;)

Owner

magnumripper replied Nov 15, 2012

I still think it's a driver bug (that was randomly hidden in the previous commit). And I think disabling vectorizing will work around it, but it will have some performance penalty. I will add some way to disable vectorizing at runtime (eg. with a john.conf setting). Could you please try disabling vectorizing the way I described earlier and report the result?

Yes, I don't mind but I guess Solar would like this conversation to be on the mailing list.

Collaborator

claudioandre replied Nov 19, 2012

I tried it (i'm using Catalyst 10.10):

  • run it with --request-scalar and it fails in a segmentation fault.
  • git checkout f000d90 (make clean, make) and it fails in a segmentation fault.
  • git checkout d7fcb99 (make clean, make) and it fails in a segmentation fault.
  • git checkout dcf86ea (make clean, make) and it fails in a segmentation fault.

I'm sure it worked fine on 10.06 (and some commit in the past, probably one of these i tried above).

So, seems driver is the one to blame.

Owner

magnumripper replied Nov 19, 2012

Commit 5d4f156 added today might make some difference to the better although I don't really expect it to. If it actually does make the problems go away, I will have to apologize to AMD for blaming them :-P

Isn't there any way to get AMD drivers with debug info not stripped out? Using a debugger I just end up with lots of anonymous hex digits deep inside some AMD library. No clue whatsoever.

Collaborator

claudioandre replied Nov 20, 2012

I haven't found any drivers with debug information.

I saw a similar error on raw-sha256 (but it wasn't running on bull, only on CPU and segfault on GPU here) and it was because i was doing some misaligned access on memory. I was using a 64 bit pointer on a 32 bit aligned data.

Please sign in to comment.