Skip to content

Commit

Permalink
core/hmi: report processor recovery reason from core FIR bits on P9
Browse files Browse the repository at this point in the history
When an error is encountered that causes processor recovery, HMI is
generated if the recovery was successful. The reason is recorded in
the core FIR, which gets copied into the WOF.

In this case dump the WOF register and an error string into the OPAL
msglog.

A broken init setting led to HMIs reported in Linux as:

[    3.591547] Harmless Hypervisor Maintenance interrupt [Recovered]
[    3.591648]  Error detail: Processor Recovery done
[    3.591714]  HMER: 2040000000000000

This patch would have been useful because it tells us exactly that
the problem is in the d-side ERAT:

[  414.489690798,7] HMI: Received HMI interrupt: HMER = 0x2040000000000000
[  414.489693339,7] HMI: [Loc: UOPWR.0000000-Node0-Proc0]: P:0 C:1 T:1: Processor recovery occurred.
[  414.489699837,7] HMI: Core WOF = 0x0000000410000000 recovered error:
[  414.489701543,7] HMI: LSU - SRAM (DCACHE parity, etc)
[  414.489702341,7] HMI: LSU - ERAT multi hit

In future it will be good to unify this reporting, so Linux could
print something more useful. Until then, this gives some good data.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Stewart Smith <stewart@linux.vnet.ibm.com>
  • Loading branch information
npiggin authored and stewartsmith committed Mar 2, 2018
1 parent 8ea3ac7 commit 56a85b4
Showing 1 changed file with 59 additions and 3 deletions.
62 changes: 59 additions & 3 deletions core/hmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@
#define P8_CORE_FIR 0x10013100
#define P9_CORE_FIR 0x20010A40

/* And core WOF (Whose On First) */
#define P9_CORE_WOF 0x20010A48

/* xscom addresses for pMisc Receive Malfunction Alert Register */
#define P8_MALFUNC_ALERT 0x02020011
#define P9_MALFUNC_ALERT 0x00090022
Expand Down Expand Up @@ -215,6 +218,28 @@ static const struct core_xstop_bit_info {
{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
};

static const struct core_recoverable_bit_info {
uint8_t bit; /* CORE FIR bit number */
const char *reason;
} recoverable_bits[] = {
{ 0, "IFU - SRAM (ICACHE parity, etc)" },
{ 2, "IFU - RegFile" },
{ 4, "IFU - Logic" },
{ 9, "ISU - RegFile" },
{ 11, "ISU - Logic" },
{ 13, "ISU - Recoverable due to not in MT window" },
{ 24, "VSU - Logic" },
{ 27, "VSU - DFU logic" },
{ 29, "LSU - SRAM (DCACHE parity, etc)" },
{ 31, "LSU - RegFile" },
/* The following 3 bits may be set by SRAM errors. */
{ 33, "LSU - TLB multi hit" },
{ 34, "LSU - SLB multi hit" },
{ 35, "LSU - ERAT multi hit" },
{ 37, "LSU - Logic" },
{ 39, "LSU - Recoverable due to not in MT window" },
};

static const struct nx_xstop_bit_info {
uint8_t bit; /* NX FIR bit number */
enum OpalHMI_NestAccelXstopReason reason;
Expand Down Expand Up @@ -313,6 +338,21 @@ static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
return rc;
}

static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
{
int rc;

switch (proc_gen) {
case proc_gen_p9:
rc = xscom_read(chip_id,
XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof);
break;
default:
rc = OPAL_HARDWARE;
}
return rc;
}

static bool decode_core_fir(struct cpu_thread *cpu,
struct OpalHMIEvent *hmi_evt)
{
Expand Down Expand Up @@ -1069,6 +1109,7 @@ static void hmi_print_debug(const uint8_t *msg)

int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
{
struct cpu_thread *cpu = this_cpu();
int recover = 1;
uint64_t tfmr;

Expand All @@ -1084,18 +1125,33 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
* looking at TFMR register. TFMR will tell us correct state of
* TB register.
*/
this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer);
if (hmi_evt)
hmi_evt->hmer = hmer;
if (hmer & SPR_HMER_PROC_RECV_DONE) {
uint32_t chip_id = pir_to_chip_id(cpu->pir);
uint32_t core_id = pir_to_core_id(cpu->pir);
uint64_t core_wof;

hmi_print_debug("Processor recovery occurred.");
if (!read_core_wof(chip_id, core_id, &core_wof)) {
int i;

prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
for (i = 0; i < ARRAY_SIZE(recoverable_bits); i++) {
if (core_wof & PPC_BIT(recoverable_bits[i].bit))
prlog(PR_DEBUG, "%s\n",
recoverable_bits[i].reason);
}
}

hmer &= ~SPR_HMER_PROC_RECV_DONE;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
queue_hmi_event(hmi_evt, recover);
}
hmi_print_debug("Processor recovery Done.");
}
if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
Expand Down Expand Up @@ -1180,7 +1236,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
mtspr(SPR_HMER, hmer);
hmi_exit();
/* Set the TB state looking at TFMR register before we head out. */
this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
unlock(&hmi_lock);
return recover;
}
Expand Down

0 comments on commit 56a85b4

Please sign in to comment.