From 233e863c8b1dccad8be7c39336d232a4a3994e6b Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 9 Oct 2019 21:38:11 +0200 Subject: [PATCH] npu2-opencapi: Log a warning when resetting a broken device On P9, the NPU doesn't support recovery if the link goes down unexpectedly. Recovery was not fully verified. We mark the device as broken when we receive an error interrupt from the NPU. However, there's nothing to prevent the OS from trying to reset the device. It may or may not work; it's unsupported territory, so let's log a message to make it clear, as it could help when debugging. We haven't hit any cases where the reset goes badly enough that we'd want to prevent it, so let it go for now. We can revisit later if we have evidence that it's causing more problems than it is worth. Signed-off-by: Frederic Barrat Reviewed-by: Christophe Lombard Signed-off-by: Oliver O'Halloran --- hw/npu2-opencapi.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c index 5658ec6b12b6..fc9e50c3f987 100644 --- a/hw/npu2-opencapi.c +++ b/hw/npu2-opencapi.c @@ -1203,6 +1203,10 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot) case OCAPI_SLOT_LINK_TRAINED: otl_enabletx(chip_id, dev->npu->xscom_base, dev); pci_slot_set_state(slot, OCAPI_SLOT_NORMAL); + if (dev->flags & NPU2_DEV_BROKEN) { + OCAPIERR(dev, "Resetting a device which hit a previous error. Device recovery is not supported, so future behavior is undefined\n"); + dev->flags &= ~NPU2_DEV_BROKEN; + } check_perf_counters(dev); dev->phb_ocapi.scan_map = 1; return OPAL_SUCCESS;