Skip to content

Commit

Permalink
npu2-opencapi: Rework link training timeout
Browse files Browse the repository at this point in the history
The OpenCAPI link state should be polled for up to 3 seconds. The current code
assumes a tight retry loop during fundamental reset at boot, which is
not going to be true on link retraining. So update the timeout
detection code to use the timebase instead of a simple retry count, which
could make the effective timeout far too long.

Reviewed-by: Christophe Lombard <clombard@linux.vnet.ibm.com>
Reviewed-by: Andrew Donnellan <ajd@linux.ibm.com>
Signed-off-by: Frederic Barrat <fbarrat@linux.ibm.com>
Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
  • Loading branch information
fbarrat authored and oohal committed Oct 22, 2019
1 parent fed081d commit 2600cfa
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 4 deletions.
9 changes: 5 additions & 4 deletions hw/npu2-opencapi.c
Expand Up @@ -1127,13 +1127,13 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
reg = get_odl_status(chip_id, dev->brick_index);
if (GETFIELD(OB_ODL_STATUS_TRAINING_STATE_MACHINE, reg) ==
OCAPI_LINK_STATE_TRAINED) {
OCAPIINF(dev, "link trained in %lld ms\n",
OCAPI_LINK_TRAINING_TIMEOUT - slot->retries);
OCAPIINF(dev, "link trained in %ld ms\n",
tb_to_msecs(mftb() - dev->train_start));
check_trained_link(dev, reg);
pci_slot_set_state(slot, OCAPI_SLOT_LINK_TRAINED);
return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
}
if (slot->retries-- == 0)
if (tb_compare(mftb(), dev->train_timeout) == TB_AAFTERB)
return npu2_opencapi_retry_state(slot, reg);

return pci_slot_set_sm_timeout(slot, msecs_to_tb(1));
Expand Down Expand Up @@ -1239,7 +1239,8 @@ static int64_t npu2_opencapi_freset(struct pci_slot *slot)
/* Bump lanes - this improves training reliability */
npu2_opencapi_bump_ui_lane(dev);
start_training(chip_id, dev);
slot->retries = OCAPI_LINK_TRAINING_TIMEOUT;
dev->train_start = mftb();
dev->train_timeout = dev->train_start + msecs_to_tb(OCAPI_LINK_TRAINING_TIMEOUT);
pci_slot_set_state(slot, OCAPI_SLOT_LINK_START);
return slot->ops.poll_link(slot);

Expand Down
2 changes: 2 additions & 0 deletions include/npu2.h
Expand Up @@ -147,6 +147,8 @@ struct npu2_dev {
uint64_t linux_pe;
bool train_need_fence;
bool train_fenced;
unsigned long train_start;
unsigned long train_timeout;
};

struct npu2 {
Expand Down

0 comments on commit 2600cfa

Please sign in to comment.