Skip to content

Commit

Permalink
filter invalid sols on-device (large PCIe bw and CPU savings)
Browse files Browse the repository at this point in the history
  • Loading branch information
mbevand committed Nov 10, 2016
1 parent 2543323 commit 146b8dc
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 11 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Current tip

* Optimizations: reduce size of collisions[] array; +7% speed increase measured
* Optimization: significantly reduce CPU usage and PCIe bandwidth (before:
~100 MB/s/GPU, after: 0.5 MB/s/GPU), accomplished by filtering invalid
solutions on-device
* Optimization: reduce size of collisions[] array; +7% speed increase measured
on RX 480 and R9 Nano using AMDGPU-PRO 16.40
* Update README.md with Nvidia performance numbers
* Fix mining on Xeon Phi and CPUs (fix OpenCL warnings)
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,8 @@ almost certainly bits 180-199), this is also discarded as a likely invalid
solution because such a solution is statistically guaranteed to have all of
its inputs repeated at least once. This check is implemented in
`kernel_sols()` (see `likely_invalids`).
* When input references are expanded on-GPU by `expand_refs()`, the code
checks whether the last (512th) input is repeated at least once; if it is,
the solution is discarded on-device and never reported to the host.
* Finally when the GPU returns potential solutions, the CPU also checks for
invalid solutions with duplicate inputs. This check is implemented in
`verify_sol()`.
Expand Down
37 changes: 28 additions & 9 deletions input.cl
Original file line number Diff line number Diff line change
Expand Up @@ -640,25 +640,39 @@ uint expand_ref(__global char *ht, uint xi_offset, uint row, uint slot)
slot * SLOT_LEN + xi_offset - 4);
}

/*
** NOTE(review): the line directly below appears to be the pre-change
** signature retained by the diff view; the body further down belongs to the
** `uint expand_refs(...)` definition that follows the comment block --
** confirm against the repository.
*/
void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
/*
** Expand references to inputs. Return 1 if so far the solution appears valid,
** or 0 otherwise (an invalid solution would be a solution with duplicate
** inputs, which can be detected at the last step: round == 0).
**
** On entry ins[0 .. nr_inputs-1] holds references; on a return of 1,
** ins[0 .. 2*nr_inputs-1] holds their expansions, with reference i replaced
** by the pair (ins[2*i], ins[2*i + 1]). The caller must therefore provide
** room for 2 * nr_inputs elements. When 0 is returned the expansion was
** abandoned part-way and the contents of ins are only partially updated.
*/
uint expand_refs(uint *ins, uint nr_inputs, __global char **htabs,
uint round)
{
// The hash table to read from is selected by the parity of the round.
__global char *ht = htabs[round % 2];
// i walks the existing references from the last one down to 0, and j is the
// upper slot of the pair written for ins[i]. Working back to front lets the
// expansion happen in place: the slots written (j and j - 1 == 2 * i) are
// never below i, so no reference is overwritten before it has been read.
uint i = nr_inputs - 1;
uint j = nr_inputs * 2 - 1;
uint xi_offset = xi_offset_for_round(round);
// -1 means "no value recorded yet"; only consulted when round == 0.
// NOTE(review): ins[] is uint while this is int; a recorded value that
// converts to -1 would be indistinguishable from the sentinel -- confirm
// that inputs cannot take that value.
int dup_to_watch = -1;
do
{
ins[j] = expand_ref(ht, xi_offset,
DECODE_ROW(ins[i]), DECODE_SLOT1(ins[i]));
ins[j - 1] = expand_ref(ht, xi_offset,
DECODE_ROW(ins[i]), DECODE_SLOT0(ins[i]));
// Duplicate detection is only attempted on the final expansion step.
if (!round)
{
// While nothing is recorded (the first iteration), remember the last
// expanded value. Its sibling ins[j - 1] is not compared against it here.
if (dup_to_watch == -1)
dup_to_watch = ins[j];
// Afterwards, either value of a newly expanded pair matching the recorded
// one means the recorded input is repeated: report the solution as invalid.
else if (ins[j] == dup_to_watch || ins[j - 1] == dup_to_watch)
return 0;
}
// Reference 0 was the last one to expand; i is unsigned, so test before
// decrementing rather than looping on i >= 0.
if (!i)
break ;
i--;
j -= 2;
}
while (1);
return 1;
}

/*
Expand All @@ -667,23 +681,28 @@ void expand_refs(__global uint *ins, uint nr_inputs, __global char **htabs,
// NOTE(review): this hunk is a diff view that interleaves statements from
// before and after the change (e.g. two `uint sol_i;` declarations and two
// expand_refs() calls). The comments below describe the flow that works on
// values_tmp; lines flagged "pre-change" appear to be the removed version --
// confirm against the repository.
void potential_sol(__global char **htabs, __global sols_t *sols,
uint ref0, uint ref1)
{
uint sol_i;
uint nr_values;
// pre-change (apparently removed): a slot in sols was reserved up front and
// marked not valid before any expansion was done.
sol_i = atomic_inc(&sols->nr);
if (sol_i >= MAX_SOLS)
return ;
sols->valid[sol_i] = 0;
// Function-local scratch buffer sized for the fully expanded solution
// (1 << PARAM_K values); expansion happens here instead of in sols.
uint values_tmp[(1 << PARAM_K)];
uint sol_i;
uint i;
nr_values = 0;
// pre-change (apparently removed): the two references were seeded directly
// into sols->values.
sols->values[sol_i][nr_values++] = ref0;
sols->values[sol_i][nr_values++] = ref1;
// Seed the scratch buffer with the two references.
values_tmp[nr_values++] = ref0;
values_tmp[nr_values++] = ref1;
// round is decremented before each use, so expand_refs() is called for
// rounds PARAM_K - 2 down to 0, doubling nr_values each time: starting from
// 2 values, PARAM_K - 1 doublings give 1 << PARAM_K values.
uint round = PARAM_K - 1;
do
{
round--;
// pre-change (apparently removed): expansion in sols, result not checked.
expand_refs(&(sols->values[sol_i][0]), nr_values, htabs, round);
// A 0 result means expand_refs() judged the solution invalid; give up
// before anything has been written to sols.
if (!expand_refs(values_tmp, nr_values, htabs, round))
return ;
nr_values *= 2;
}
while (round > 0);
// solution appears valid, copy it to sols
// The slot index is taken from the shared counter. The counter has already
// been incremented when the bound check fails, so sols->nr can end up above
// MAX_SOLS even though nothing is stored for those slots.
sol_i = atomic_inc(&sols->nr);
if (sol_i >= MAX_SOLS)
return ;
for (i = 0; i < (1 << PARAM_K); i++)
sols->values[sol_i][i] = values_tmp[i];
sols->valid[sol_i] = 1;
}

Expand Down
1 change: 1 addition & 0 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ uint32_t verify_sols(cl_command_queue queue, cl_mem buf_sols, uint64_t *nonce,
sols->nr - MAX_SOLS);
sols->nr = MAX_SOLS;
}
debug("Retrieved %d potential solutions\n", sols->nr);
nr_valid_sols = 0;
for (unsigned sol_i = 0; sol_i < sols->nr; sol_i++)
nr_valid_sols += verify_sol(sols, sol_i);
Expand Down
2 changes: 1 addition & 1 deletion param.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
// instructions. 10 is the max supported by the hw.
#define BLAKE_WPS 10
// Maximum number of solutions reported by kernel to host
// NOTE(review): two definitions are shown because this is a diff view; 2000
// appears to be the value before the change and 10 the value after it --
// confirm against the repository.
#define MAX_SOLS 2000
#define MAX_SOLS 10
// Length of SHA256 target
#define SHA256_TARGET_LEN (256 / 8)

Expand Down

0 comments on commit 146b8dc

Please sign in to comment.