Skip to content

Commit

Permalink
opal/hmi: Handle early HMIs on thread0 when secondaries are still in …
Browse files Browse the repository at this point in the history
…OPAL.

When the primary thread receives a CORE level HMI for timer facility errors
while the secondaries are still in OPAL, thread 0 ends up stuck in the
rendez-vous, waiting for the secondaries to enter HMI handling. This is because
OPAL runs with MSR(EE=0), and hence HMIs are delayed on secondary threads until
they are handed over to the Linux OS. Fix this by checking the state of the
secondaries and forcing them into HMI handling by queuing a job on each
secondary thread that is still in OPAL.

I have tested this by injecting an HDEC parity error very early during Linux
kernel boot. Recovery works fine for non-TB errors. But if the TB is bad at
this very early stage, we are already doomed.

Without this patch we see:

[  285.046347408,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[  285.051160609,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[  285.055359021,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[  285.055361439,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e14000) Timer Facility Error
[  286.232183823,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc1)
[  287.409002056,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc1)
[  289.073820164,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc1)
[  290.250638683,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc2)
[  291.427456821,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc2)
[  293.092274807,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc2)
[  294.269092904,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc3)
[  295.445910944,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc3)
[  297.110728970,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc3)

After this patch:

[  259.401719351,7] OPAL: Start CPU 0x0841 (PIR 0x0841) -> 0x000000000000a83c
[  259.406259572,7] OPAL: Start CPU 0x0842 (PIR 0x0842) -> 0x000000000000a83c
[  259.410615534,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[  259.415444519,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[  259.419641401,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[  259.419644124,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e04000) Timer Facility Error
[  259.419650678,7] HMI: Sending hmi job to thread 1
[  259.419652744,7] HMI: Sending hmi job to thread 2
[  259.419653051,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[  259.419654725,7] HMI: Sending hmi job to thread 3
[  259.419654916,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[  259.419658025,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[  259.419658406,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:2: TFMR(2e12002870e04000) Timer Facility Error
[  259.419663095,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:3: TFMR(2e12002870e04000) Timer Facility Error
[  259.419655234,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:1: TFMR(2e12002870e04000) Timer Facility Error
[  259.425109779,7] OPAL: Start CPU 0x0845 (PIR 0x0845) -> 0x000000000000a83c
[  259.429870681,7] OPAL: Start CPU 0x0846 (PIR 0x0846) -> 0x000000000000a83c
[  259.434549250,7] OPAL: Start CPU 0x0847 (PIR 0x0847) -> 0x000000000000a83c

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Signed-off-by: Stewart Smith <stewart@linux.ibm.com>
  • Loading branch information
maheshsal authored and stewartsmith committed Sep 27, 2018
1 parent 1355c31 commit c884f2d
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions core/hmi.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <npu.h>
#include <capp.h>
#include <nvram.h>
#include <cpu.h>

/*
* HMER register layout:
Expand Down Expand Up @@ -966,14 +967,54 @@ static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
return recover;
}

static int64_t opal_handle_hmi(void);

/*
 * CPU job callback wrapping opal_handle_hmi(), so that the HMI handler can
 * be queued on another thread with cpu_queue_job(). The job argument is not
 * used, and the handler's return value is discarded.
 */
static void opal_handle_hmi_job(void *data __unused)
{
	(void)opal_handle_hmi();
}

/*
 * Force secondary threads that are still running in OPAL into HMI handling
 * by queuing opal_handle_hmi_job() on each of them.
 *
 * This function is called by thread 0. Starting from the calling thread it
 * walks the next (cpu_thread_count - 1) CPUs and queues a job on every one
 * whose state is cpu_state_active.
 *
 * Returns NULL when no job was queued. Otherwise returns an array of
 * cpu_thread_count job pointers, allocated with zalloc() on first use and
 * indexed by the loop position (1 .. cpu_thread_count - 1; slot 0 is never
 * written). Slots of threads that were not kicked are never assigned. The
 * caller is expected to wait on the queued jobs and free the array.
 */
static struct cpu_job **hmi_kick_secondaries(void)
{
	struct cpu_job **jobs = NULL;
	struct cpu_thread *sibling = this_cpu();
	int thread;

	for (thread = 1; thread < cpu_thread_count; thread++) {
		sibling = next_cpu(sibling);

		/* Only threads that are still in OPAL need to be kicked. */
		if (sibling->state != cpu_state_active)
			continue;

		/* Allocate the job table lazily, on the first kick. */
		if (!jobs) {
			int bytes = sizeof(struct cpu_job *) * cpu_thread_count;

			jobs = zalloc(bytes);
			assert(jobs);
		}

		prlog(PR_DEBUG, "Sending hmi job to thread %d\n", thread);
		jobs[thread] = cpu_queue_job(sibling, "handle_hmi_job",
					     opal_handle_hmi_job, NULL);
	}

	return jobs;
}

static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
struct cpu_thread *t, *t0;
int recover = -1;
struct cpu_job **hmi_jobs = NULL;

t = this_cpu();
t0 = find_cpu_by_pir(cpu_get_thread0(t));

if (t == t0 && t0->state == cpu_state_os)
hmi_jobs = hmi_kick_secondaries();

/* Rendez vous all threads */
hmi_rendez_vous(1);

Expand Down Expand Up @@ -1055,6 +1096,14 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
if (t0->tb_resynced)
*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;

if (t == t0 && hmi_jobs) {
int i;
for (i = 1; i < cpu_thread_count; i++)
if (hmi_jobs[i])
cpu_wait_job(hmi_jobs[i], true);
free(hmi_jobs);
}

return recover;
}

Expand Down

0 comments on commit c884f2d

Please sign in to comment.