Nvidia busywait fix (#54)

mbevand · Nov 15, 2016 · a6c3517 · a6c3517
1 parent 1719551
commit a6c3517
Show file tree

Hide file tree

Showing 4 changed files with 77 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,9 @@
 # Current tip
 
-* Implement mining.extranonce.subscribe (kenshirothefist)
+* Avoid 100% CPU usage with Nvidia's OpenCL, aka busywait fix (Kubuxu)
 * Optimization: +10% speedup, increase collision items tracked per thread
   (nerdralph). 'make test' finds 196 sols again.
+* Implement mining.extranonce.subscribe (kenshirothefist)
 
 # Version 5 (11 Nov 2016)
 

diff --git a/README.md b/README.md
@@ -267,6 +267,7 @@ I would like to thank these persons for their contributions to SILENTARMY,
 in alphabetical order:
 * eXtremal
 * kenshirothefist
+* Kubuxu
 * lhl
 * nerdralph
 * poiuty

diff --git a/main.c b/main.c
@@ -13,6 +13,7 @@
 #include <unistd.h>
 #include <getopt.h>
 #include <errno.h>
+#include <time.h>
 #include <CL/cl.h>
 #include "blake.h"
 #include "_kernel.h"
@@ -25,12 +26,13 @@ typedef uint32_t	uint;
 #define MIN(A, B)	(((A) < (B)) ? (A) : (B))
 #define MAX(A, B)	(((A) > (B)) ? (A) : (B))
 
-int             verbose = 0;
+int		verbose = 0;
 uint32_t	show_encoded = 0;
 uint64_t	nr_nonces = 1;
 uint32_t	do_list_devices = 0;
 uint32_t	gpu_to_use = 0;
 uint32_t	mining = 0;
+double		kern_avg_run_time = 0;
 
 typedef struct  debug_s
 {
@@ -113,6 +115,23 @@ void randomize(void *p, ssize_t l)
 	fatal("close %s: %s\n", fname, strerror(errno));
 }
 
+#define NSEC 1e-9
+double timespec_to_double(struct timespec *t)
+{
+    return ((double)t->tv_sec) + ((double) t->tv_nsec) * NSEC;
+}
+
+void double_to_timespec(double dt, struct timespec *t)
+{
+    t->tv_sec = (long)dt;
+    t->tv_nsec = (long)((dt - t->tv_sec) / NSEC);
+}
+
+void get_time(struct timespec *t)
+{
+    clock_gettime(CLOCK_MONOTONIC, t);
+}
+
 cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
 	void *host_ptr)
 {
@@ -776,21 +795,57 @@ uint32_t verify_sol(sols_t *sols, unsigned sol_i)
 */
 uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
 	uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
-       	char *job_id, uint32_t *shares)
+	char *job_id, uint32_t *shares, struct timespec *target_time)
 {
     sols_t	*sols;
     uint32_t	nr_valid_sols;
     sols = (sols_t *)malloc(sizeof (*sols));
     if (!sols)
 	fatal("malloc: %s\n", strerror(errno));
+    // Most OpenCL implementations of clEnqueueReadBuffer in blocking mode are
+    // good, except Nvidia implementing it as a wasteful busywait, so let's
+    // work around it by trying to sleep just a bit less than the expected
+    // amount of time.
+    cl_event readEvent;
     check_clEnqueueReadBuffer(queue, buf_sols,
-	    CL_TRUE,	// cl_bool	blocking_read
+	    CL_FALSE,	// cl_bool	blocking_read
 	    0,		// size_t	offset
 	    sizeof (*sols),	// size_t	size
 	    sols,	// void		*ptr
 	    0,		// cl_uint	num_events_in_wait_list
 	    NULL,	// cl_event	*event_wait_list
-	    NULL);	// cl_event	*event
+	    &readEvent);	// cl_event	*event
+    // flushing is crucial to initiate the read *now* before sleeping
+    clFlush(queue);
+    struct timespec start_time;
+    get_time(&start_time);
+    double dtarget = timespec_to_double(target_time);
+    cl_int readStatus;
+    clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
+	    sizeof (cl_int), &readStatus, NULL);
+    while (readStatus != CL_COMPLETE && SLEEP_SKIP_RATIO != 1)
+      {
+	struct timespec t;
+	get_time(&t);
+	double dt = timespec_to_double(&t);
+	double delta = dtarget - dt;
+	if (delta < 0)
+	    break;
+	double_to_timespec(delta * SLEEP_RECHECK_RATIO, &t);
+	nanosleep(&t, NULL);
+	clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
+		sizeof (cl_int), &readStatus, NULL);
+      }
+    clWaitForEvents(1, &readEvent);
+    struct timespec end_time;
+    get_time(&end_time);
+    double dstart, dend, delta;
+    dstart = timespec_to_double(&start_time);
+    dend = timespec_to_double(&end_time);
+    delta = dend - dstart;
+    kern_avg_run_time = kern_avg_run_time * 6.0 / 10.0 + delta * (4.0 / 10.0);
+    kern_avg_run_time *= (1 - (double)SLEEP_SKIP_RATIO);
+    // let's check these solutions we just read...
     if (sols->nr > MAX_SOLS)
       {
 	fprintf(stderr, "%d (probably invalid) solutions were dropped!\n",
@@ -906,8 +961,16 @@ uint32_t solve_equihash(cl_context ctx, cl_command_queue queue,
     global_ws = NR_ROWS;
     check_clEnqueueNDRangeKernel(queue, k_sols, 1, NULL,
 	    &global_ws, &local_work_size, 0, NULL, NULL);
+    // compute the expected run time of the kernels that have been queued
+    struct timespec start_time, target_time;
+    get_time(&start_time);
+    double dstart, dtarget = 0;
+    dstart = timespec_to_double(&start_time);
+    dtarget = dstart + kern_avg_run_time;
+    double_to_timespec(dtarget, &target_time);
+    // read solutions
     sol_found = verify_sols(queue, buf_sols, nonce_ptr, header,
-	    fixed_nonce_bytes, target, job_id, shares);
+	    fixed_nonce_bytes, target, job_id, shares, &target_time);
     clReleaseMemObject(buf_blake_st);
     return sol_found;
 }

diff --git a/param.h b/param.h
@@ -13,6 +13,12 @@
 // Number of collision items to track, per thread
 #define COLL_DATA_SIZE_PER_TH		(NR_SLOTS * 5)
 
+// Ratio of time of sleeping before rechecking if task is done (0-1)
+#define SLEEP_RECHECK_RATIO 0.60
+// Ratio of time to busy wait for the solution (0-1)
+// The higher value the higher CPU usage with Nvidia
+#define SLEEP_SKIP_RATIO 0.005
+
 // Make hash tables OVERHEAD times larger than necessary to store the average
 // number of elements per row. The ideal value is as small as possible to
 // reduce memory usage, but not too small or else elements are dropped from the