Skip to content

Commit

Permalink
Nvidia busywait fix (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbevand committed Nov 15, 2016
1 parent 1719551 commit a6c3517
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 6 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
# Current tip

* Implement mining.extranonce.subscribe (kenshirothefist)
* Avoid 100% CPU usage with Nvidia's OpenCL, aka busywait fix (Kubuxu)
* Optimization: +10% speedup, increase collision items tracked per thread
(nerdralph). 'make test' finds 196 sols again.
* Implement mining.extranonce.subscribe (kenshirothefist)

# Version 5 (11 Nov 2016)

Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ I would like to thank these persons for their contributions to SILENTARMY,
in alphabetical order:
* eXtremal
* kenshirothefist
* Kubuxu
* lhl
* nerdralph
* poiuty
Expand Down
73 changes: 68 additions & 5 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <time.h>
#include <CL/cl.h>
#include "blake.h"
#include "_kernel.h"
Expand All @@ -25,12 +26,13 @@ typedef uint32_t uint;
#define MIN(A, B) (((A) < (B)) ? (A) : (B))
#define MAX(A, B) (((A) > (B)) ? (A) : (B))

int verbose = 0;
int verbose = 0;
uint32_t show_encoded = 0;
uint64_t nr_nonces = 1;
uint32_t do_list_devices = 0;
uint32_t gpu_to_use = 0;
uint32_t mining = 0;
double kern_avg_run_time = 0;

typedef struct debug_s
{
Expand Down Expand Up @@ -113,6 +115,23 @@ void randomize(void *p, ssize_t l)
fatal("close %s: %s\n", fname, strerror(errno));
}

#define NSEC 1e-9
double timespec_to_double(struct timespec *t)
{
return ((double)t->tv_sec) + ((double) t->tv_nsec) * NSEC;
}

void double_to_timespec(double dt, struct timespec *t)
{
t->tv_sec = (long)dt;
t->tv_nsec = (long)((dt - t->tv_sec) / NSEC);
}

void get_time(struct timespec *t)
{
clock_gettime(CLOCK_MONOTONIC, t);
}

cl_mem check_clCreateBuffer(cl_context ctx, cl_mem_flags flags, size_t size,
void *host_ptr)
{
Expand Down Expand Up @@ -776,21 +795,57 @@ uint32_t verify_sol(sols_t *sols, unsigned sol_i)
*/
uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
uint8_t *header, size_t fixed_nonce_bytes, uint8_t *target,
char *job_id, uint32_t *shares)
char *job_id, uint32_t *shares, struct timespec *target_time)
{
sols_t *sols;
uint32_t nr_valid_sols;
sols = (sols_t *)malloc(sizeof (*sols));
if (!sols)
fatal("malloc: %s\n", strerror(errno));
// Most OpenCL implementations of clEnqueueReadBuffer in blocking mode are
// good, except Nvidia implementing it as a wasteful busywait, so let's
// work around it by trying to sleep just a bit less than the expected
// amount of time.
cl_event readEvent;
check_clEnqueueReadBuffer(queue, buf_sols,
CL_TRUE, // cl_bool blocking_read
CL_FALSE, // cl_bool blocking_read
0, // size_t offset
sizeof (*sols), // size_t size
sols, // void *ptr
0, // cl_uint num_events_in_wait_list
NULL, // cl_event *event_wait_list
NULL); // cl_event *event
&readEvent); // cl_event *event
// flushing is crucial to initiate the read *now* before sleeping
clFlush(queue);
struct timespec start_time;
get_time(&start_time);
double dtarget = timespec_to_double(target_time);
cl_int readStatus;
clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof (cl_int), &readStatus, NULL);
while (readStatus != CL_COMPLETE && SLEEP_SKIP_RATIO != 1)
{
struct timespec t;
get_time(&t);
double dt = timespec_to_double(&t);
double delta = dtarget - dt;
if (delta < 0)
break;
double_to_timespec(delta * SLEEP_RECHECK_RATIO, &t);
nanosleep(&t, NULL);
clGetEventInfo(readEvent, CL_EVENT_COMMAND_EXECUTION_STATUS,
sizeof (cl_int), &readStatus, NULL);
}
clWaitForEvents(1, &readEvent);
struct timespec end_time;
get_time(&end_time);
double dstart, dend, delta;
dstart = timespec_to_double(&start_time);
dend = timespec_to_double(&end_time);
delta = dend - dstart;
kern_avg_run_time = kern_avg_run_time * 6.0 / 10.0 + delta * (4.0 / 10.0);
kern_avg_run_time *= (1 - (double)SLEEP_SKIP_RATIO);
// let's check these solutions we just read...
if (sols->nr > MAX_SOLS)
{
fprintf(stderr, "%d (probably invalid) solutions were dropped!\n",
Expand Down Expand Up @@ -906,8 +961,16 @@ uint32_t solve_equihash(cl_context ctx, cl_command_queue queue,
global_ws = NR_ROWS;
check_clEnqueueNDRangeKernel(queue, k_sols, 1, NULL,
&global_ws, &local_work_size, 0, NULL, NULL);
// compute the expected run time of the kernels that have been queued
struct timespec start_time, target_time;
get_time(&start_time);
double dstart, dtarget = 0;
dstart = timespec_to_double(&start_time);
dtarget = dstart + kern_avg_run_time;
double_to_timespec(dtarget, &target_time);
// read solutions
sol_found = verify_sols(queue, buf_sols, nonce_ptr, header,
fixed_nonce_bytes, target, job_id, shares);
fixed_nonce_bytes, target, job_id, shares, &target_time);
clReleaseMemObject(buf_blake_st);
return sol_found;
}
Expand Down
6 changes: 6 additions & 0 deletions param.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@
// Number of collision items to track, per thread
#define COLL_DATA_SIZE_PER_TH (NR_SLOTS * 5)

// Ratio of time of sleeping before rechecking if task is done (0-1)
#define SLEEP_RECHECK_RATIO 0.60
// Ratio of time to busy wait for the solution (0-1)
// The higher value the higher CPU usage with Nvidia
#define SLEEP_SKIP_RATIO 0.005

// Make hash tables OVERHEAD times larger than necessary to store the average
// number of elements per row. The ideal value is as small as possible to
// reduce memory usage, but not too small or else elements are dropped from the
Expand Down

0 comments on commit a6c3517

Please sign in to comment.