Skip to content

Commit

Permalink
opal/hmi: Remove races in clearing HMER
Browse files Browse the repository at this point in the history
Writing to HMER acts as an "AND". The current code writes back the
value we originally read with the bits we handled cleared. This is
racy: if a new bit gets set in HW after the original read, we'll end
up clearing it without handling it.

Instead, use an all 1's mask with only the handled bits cleared.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
  • Loading branch information
ozbenh authored and stewartsmith committed Apr 17, 2018
1 parent 4344628 commit a9d92e2
Showing 1 changed file with 12 additions and 10 deletions.
22 changes: 12 additions & 10 deletions core/hmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -1139,7 +1139,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
{
struct cpu_thread *cpu = this_cpu();
int recover = 1;
uint64_t tfmr;
uint64_t tfmr, handled = 0;

/*
* In case of split core, some of the Timer facility errors need
Expand Down Expand Up @@ -1174,15 +1174,15 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
}
}

hmer &= ~SPR_HMER_PROC_RECV_DONE;
handled |= SPR_HMER_PROC_RECV_DONE;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
queue_hmi_event(hmi_evt, recover);
}
}
if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
handled |= SPR_HMER_PROC_RECV_ERROR_MASKED;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_MASKED;
Expand All @@ -1191,7 +1191,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
hmi_print_debug("Processor recovery Done (masked).", hmer);
}
if (hmer & SPR_HMER_PROC_RECV_AGAIN) {
hmer &= ~SPR_HMER_PROC_RECV_AGAIN;
handled |= SPR_HMER_PROC_RECV_AGAIN;
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE_AGAIN;
Expand All @@ -1202,7 +1202,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
}
/* Assert if we see malfunction alert, we can not continue. */
if (hmer & SPR_HMER_MALFUNCTION_ALERT) {
hmer &= ~SPR_HMER_MALFUNCTION_ALERT;
handled |= SPR_HMER_MALFUNCTION_ALERT;

hmi_print_debug("Malfunction Alert", hmer);
if (hmi_evt)
Expand All @@ -1211,7 +1211,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)

/* Assert if we see Hypervisor resource error, we can not continue. */
if (hmer & SPR_HMER_HYP_RESOURCE_ERR) {
hmer &= ~SPR_HMER_HYP_RESOURCE_ERR;
handled |= SPR_HMER_HYP_RESOURCE_ERR;

hmi_print_debug("Hypervisor resource error", hmer);
recover = 0;
Expand All @@ -1228,10 +1228,10 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
*/
if (hmer & SPR_HMER_TFAC_ERROR) {
tfmr = mfspr(SPR_TFMR); /* save original TFMR */
handled |= SPR_HMER_TFAC_ERROR;

hmi_print_debug("Timer Facility Error", hmer);

hmer &= ~SPR_HMER_TFAC_ERROR;
recover = chiptod_recover_tb_errors();
if (hmi_evt) {
hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
Expand All @@ -1242,7 +1242,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
}
if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
tfmr = mfspr(SPR_TFMR); /* save original TFMR */
hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
handled |= SPR_HMER_TFMR_PARITY_ERROR;

hmi_print_debug("TFMR parity Error", hmer);
recover = chiptod_recover_tb_errors();
Expand All @@ -1259,9 +1259,11 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
/*
* HMER bits are sticky, once set to 1 they remain set to 1 until
* they are set to 0. Reset the error source bit to 0, otherwise
* we keep getting HMI interrupt again and again.
* we keep getting HMI interrupt again and again. Writing to HMER
* acts as an AND, so we write mask of all 1's except for the bits
* we want to clear.
*/
mtspr(SPR_HMER, hmer);
mtspr(SPR_HMER, ~handled);
hmi_exit();
/* Set the TB state looking at TFMR register before we head out. */
cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
Expand Down

0 comments on commit a9d92e2

Please sign in to comment.