Skip to content

Commit

Permalink
knc: Automatically disable and re-enable problematic cores
Browse files Browse the repository at this point in the history
  • Loading branch information
bondus authored and luke-jr committed Oct 26, 2013
1 parent 6eab134 commit 8b69095
Showing 1 changed file with 90 additions and 3 deletions.
93 changes: 90 additions & 3 deletions driver-knc.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@
#define KNC_SPI_MODE (SPI_CPHA | SPI_CPOL | SPI_CS_HIGH)
#define KNC_SPI_BITS 8


/*
The core disable/enable strategy is as follows:
If a core gets 10 HW errors in a row without doing any proper work,
it is disabled for 10 seconds.
The next time the core gets 10 HW errors in a row, the time since it was
last enabled is compared to the time since it started to get errors.
If those times are close (within 50%), the disable time is doubled
(up to a maximum of 15 minutes);
if not, the core is just disabled for 10 seconds again.
*/

// Number of consecutive HW errors on one core that triggers disabling it
#define KNC_MAX_HWERR_IN_ROW 10
// Base length of a disable period, in seconds (also the value it is reset to)
#define KNC_HWERR_DISABLE_SECS (10)
// Upper bound for the disable period as it gets doubled, in seconds (15 minutes)
#define KNC_MAX_DISABLE_SECS (15 * 60)


static const char * const i2cpath = "/dev/i2c-2";

#define KNC_I2C_TEMPLATE "/dev/i2c-%d"
Expand All @@ -52,6 +71,11 @@ enum knc_reply_type {
KNC_REPLY_WORK_DONE = 2,
};

/*
 * Per-core status values that knc_init compares against the status bytes
 * in its buffer. Any value other than these two is treated by knc_init as
 * a permanently disabled core.
 */
enum knc_i2c_core_status {
	KNC_I2CSTATUS_DISABLED = 0x02,
	KNC_I2CSTATUS_ENABLED  = 0x03,
};

struct device_drv knc_drv;

struct knc_device {
Expand All @@ -74,6 +98,11 @@ struct knc_core {

float volt;
float current;

int hwerr_in_row;
int hwerr_disable_time;
struct timeval enable_at;
struct timeval first_hwerr;
};

static
Expand Down Expand Up @@ -256,10 +285,22 @@ bool knc_init(struct thr_info * const thr)
*knccore = (struct knc_core){
.asicno = i2cslave - 0x20,
.coreno = i + j,
.hwerr_in_row = 0,
.hwerr_disable_time = KNC_HWERR_DISABLE_SECS,
};
timer_set_now(&knccore->enable_at);
proc->device_data = knc;
if (buf[j] != 3)
proc->deven = DEV_DISABLED;
switch (buf[j])
{
case KNC_I2CSTATUS_ENABLED:
break;
default: // permanently disabled
timer_unset(&knccore->enable_at); // never enable
// fallthru
case KNC_I2CSTATUS_DISABLED:
proc->deven = DEV_DISABLED;
break;
}

proc = proc->next_proc;
if ((!proc) || proc->device == proc)
Expand Down Expand Up @@ -485,6 +526,7 @@ void knc_poll(struct thr_info * const thr)
for (i = 0; i < coreno; ++i)
proc = proc->next_proc;
mythr = proc->thr[0];
knccore = mythr->cgpu_data;

i = get_u16be(&rxbuf[2]);
HASH_FIND_INT(knc->devicework, &i, work);
Expand All @@ -509,7 +551,8 @@ void knc_poll(struct thr_info * const thr)
case KNC_REPLY_NONCE_FOUND:
nonce = get_u32be(&rxbuf[4]);
nonce = le32toh(nonce);
submit_nonce(mythr, work, nonce);
if (submit_nonce(mythr, work, nonce))
knccore->hwerr_in_row = 0;
break;
case KNC_REPLY_WORK_DONE:
HASH_DEL(knc->devicework, work);
Expand Down Expand Up @@ -597,6 +640,41 @@ float knc_dcdc_decode_5_11(uint16_t raw)
return dcdc_vin;
}

/*
 * Hardware-error hook for a single core (installed as .hw_error in knc_drv).
 *
 * Counts consecutive hardware errors for the core behind `thr`, recording
 * the time of the first error in the streak. Once the streak reaches
 * KNC_MAX_HWERR_IN_ROW while the core is still enabled, the core is marked
 * DEV_DISABLED and enable_at is set to the time it may be enabled again.
 *
 * The disable period is doubled (capped at KNC_MAX_DISABLE_SECS) when the
 * errors began soon after the core was last enabled, i.e. the previous
 * disable period did not help; otherwise it is reset to
 * KNC_HWERR_DISABLE_SECS.
 */
static
void knc_hw_error(struct thr_info * const thr)
{
	struct cgpu_info * const proc = thr->cgpu;
	struct knc_core * const knccore = thr->cgpu_data;
	struct timeval now;
	float first_err_dt, enable_dt;

	// A new streak of errors begins here: remember when it started
	if (!knccore->hwerr_in_row)
		timer_set_now(&knccore->first_hwerr);
	knccore->hwerr_in_row++;

	// Nothing more to do until the streak is long enough
	if (knccore->hwerr_in_row < KNC_MAX_HWERR_IN_ROW)
		return;
	// Only cores that are currently enabled get disabled
	if (proc->deven != DEV_ENABLED)
		return;

	timer_set_now(&now);
	first_err_dt = tdiff(&now, &knccore->first_hwerr);
	enable_dt = tdiff(&now, &knccore->enable_at);

	if (enable_dt < first_err_dt * 1.5)
	{
		// Errors started shortly after the last enable, so the
		// previous disable period didn't really do much good:
		// back off for twice as long, up to the maximum
		int doubled = knccore->hwerr_disable_time * 2;

		knccore->hwerr_disable_time = (doubled > KNC_MAX_DISABLE_SECS) ? KNC_MAX_DISABLE_SECS : doubled;
	}
	else
	{
		// Core worked for a while before failing: start over with the base period
		knccore->hwerr_disable_time = KNC_HWERR_DISABLE_SECS;
	}

	proc->deven = DEV_DISABLED;
	// Logged values: time since enable, then time since first error of the streak
	applog(LOG_WARNING, "%"PRIpreprv": Disabled. %d hwerr in %.3f / %.3f . disabled %d s",
	       proc->proc_repr, knccore->hwerr_in_row,
	       enable_dt, first_err_dt, knccore->hwerr_disable_time);

	// Delay is passed as the disable period in seconds multiplied by 1000000
	timer_set_delay_from_now(&knccore->enable_at, knccore->hwerr_disable_time * 1000000);
}

static
bool knc_get_stats(struct cgpu_info * const cgpu)
{
Expand All @@ -613,6 +691,7 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
int i2c;
int32_t rawtemp, rawvolt, rawcurrent;
float temp, volt, current;
struct timeval tv_now;
bool rv = false;

char i2cpath[sizeof(KNC_I2C_TEMPLATE)];
Expand Down Expand Up @@ -648,6 +727,7 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
Datasheet at http://www.lineagepower.com/oem/pdf/MDT040A0X.pdf
*/

timer_set_now(&tv_now);
for (proc = cgpu, i = 0; proc && proc->device == cgpu; proc = proc->next_proc, ++i)
{
thr = proc->thr[0];
Expand Down Expand Up @@ -681,6 +761,12 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
proc->temp = temp;
knccore->volt = volt;
knccore->current = current;

if (proc->deven == DEV_DISABLED && timer_passed(&knccore->enable_at, &tv_now))
{
knccore->hwerr_in_row = 0;
proc_enable(proc);
}
}

rv = true;
Expand Down Expand Up @@ -727,6 +813,7 @@ struct device_drv knc_drv = {
.queue_append = knc_queue_append,
.queue_flush = knc_queue_flush,
.poll = knc_poll,
.hw_error = knc_hw_error,

.get_stats = knc_get_stats,
.get_api_extra_device_status = knc_api_extra_device_status,
Expand Down

0 comments on commit 8b69095

Please sign in to comment.