Skip to content

Commit

Permalink
Add retry to slave core wakeup path
Browse files Browse the repository at this point in the history
We are still seeing some very intermittent errors in the slave
core wakeup path.  It still seems like we may have a timing issue.
Until we figure out exactly what is going on, I am adding a retry
mechanism that should get the core to report in correctly.  The
retry is done by issuing an additional doorbell message to the
core that didn't report in.

Change-Id: Ib87e5d58e079674d1eebb44c10d0252a35ea0519
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/70762
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
dcrowell77 committed Jan 25, 2019
1 parent 2fcb5a9 commit dad96d1
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 7 deletions.
11 changes: 10 additions & 1 deletion src/include/kernel/cpumgr.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2010,2017 */
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -146,6 +146,15 @@ class CpuManager
static void startCore(uint64_t pir,uint64_t i_threads);


/** @fn wakeupCore
* Start the core, can only be run after startCore.
*
* @param[in] pir - PIR value of first thread in core.
* @param[in] i_threads - Bitstring of threads to enable (left-justified).
*/
static void wakeupCore(uint64_t pir,uint64_t i_threads);


/** @fn forceMemoryPeriodic()
* Force the memory free / coalesce operations to be performed on the
* next "periodic" interval.
Expand Down
4 changes: 3 additions & 1 deletion src/include/kernel/syscalls.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2010,2018 */
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -99,6 +99,8 @@ namespace Systemcalls
MISC_CPUNAP,
/** cpu_master_winkle() */
MISC_CPUWINKLE,
/** cpu_wakeup_core() */
MISC_CPUWAKEUPCORE,

/** mm_alloc_block() */
MM_ALLOC_BLOCK,
Expand Down
18 changes: 17 additions & 1 deletion src/include/sys/misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -239,6 +239,22 @@ int cpu_master_winkle(bool i_fusedCores);
*/
int cpu_all_winkle();

/** @fn cpu_wakeup_core
* @brief Have the kernel wakeup a core that was previously started.
*
* @param[in] pir - PIR value of the first thread on the core.
* @param[in] i_threads - Bitstring of threads to enable (left-justified).
*
* @note The kernel will wakeup all threads on the requested core even
* though the callee only requests with a single PIR value.
*
* @return 0 or -(errno) on failure.
*
* @retval -ENXIO - The core ID was outside of the range the kernel is
* prepared to support.
*/
int cpu_wakeup_core(uint64_t pir,uint64_t i_threads);

/** @fn cpu_crit_assert
* @brief Forces a Terminate Immediate after a crit-assert is issued
* @param[in] i_failAddr - value in the linkRegister of the address
Expand Down
37 changes: 35 additions & 2 deletions src/kernel/cpumgr.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2010,2018 */
/* Contributors Listed Below - COPYRIGHT 2010,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -446,7 +446,7 @@ void CpuManager::startCore(uint64_t pir,uint64_t i_threads)
// Only wakeup the threads we were told to wakeup
if( i_threads & (0x8000000000000000 >> i) )
{
printk("Dbell pir 0x%lx\n", pir + i);
printk("Dbell:0x%lx\n", pir + i);
//Initiate the Doorbell for this core/pir
send_doorbell_wakeup(pir + i);
}
Expand All @@ -455,6 +455,39 @@ void CpuManager::startCore(uint64_t pir,uint64_t i_threads)
return;
};

void CpuManager::wakeupCore(uint64_t pir,uint64_t i_threads)
{
size_t threads = getThreadCount();
pir = pir & ~(threads-1);

if (pir >=
(KERNEL_MAX_SUPPORTED_NODES * KERNEL_MAX_SUPPORTED_CPUS_PER_NODE))
{
TASK_SETRTN(TaskManager::getCurrentTask(), -ENXIO);
return;
}

//Send a message to userspace that a core with this base pir is being added
// userspace will know which threads on the core to expect already
InterruptMsgHdlr::addCpuCore(pir);

// Physically wakeup the threads with doorbells
// Assumption is that startCore has already run so all
// internal structures are setup
for(size_t i = 0; i < threads; i++)
{
// Only wakeup the threads we were told to wakeup
if( i_threads & (0x8000000000000000 >> i) )
{
printk("Dbell2:0x%lx\n", pir + i);
//Initiate the Doorbell for this core/pir
doorbell_send(pir + i);
}
}

return;
};

size_t CpuManager::getThreadCount()
{
size_t threads = 0;
Expand Down
11 changes: 11 additions & 0 deletions src/kernel/syscall.C
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ namespace KernelIpc
extern "C"
void kernel_execute_hype_doorbell()
{
printkd("hyp_doorbell on %lx\n", getPIR());

// Per POWER ISA Section 5.9.2, to avoid any weak consistency
// issues we must use a msgsync instruction before consuming
// any data set by a different thread following a doorbell
Expand Down Expand Up @@ -141,6 +143,7 @@ namespace Systemcalls
void CpuSprSet(task_t *t);
void CpuNap(task_t *t);
void CpuWinkle(task_t *t);
void CpuWakeupCore(task_t *t);
void MmAllocBlock(task_t *t);
void MmRemovePages(task_t *t);
void MmSetPermission(task_t *t);
Expand Down Expand Up @@ -184,6 +187,7 @@ namespace Systemcalls
&CpuSprSet, // MISC_CPUSPRSET
&CpuNap, // MISC_CPUNAP
&CpuWinkle, // MISC_CPUWINKLE
&CpuWakeupCore, // MISC_CPUWAKEUPCORE

&MmAllocBlock, // MM_ALLOC_BLOCK
&MmRemovePages, // MM_REMOVE_PAGES
Expand Down Expand Up @@ -848,6 +852,13 @@ namespace Systemcalls
}
}

/** Force thread wakeup via doorbell. */
void CpuWakeupCore(task_t *t)
{
CpuManager::wakeupCore(static_cast<uint64_t>(TASK_GETARG0(t)),
static_cast<uint64_t>(TASK_GETARG1(t)));
};

/**
* Allocate a block of virtual memory within the base segment
* @param[in] t: The task used to allocate a block in the base segment
Expand Down
10 changes: 9 additions & 1 deletion src/lib/syscall_misc.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -132,6 +132,14 @@ int cpu_all_winkle()
return rc;
}

int cpu_wakeup_core(uint64_t pir,uint64_t i_threads)
{
return reinterpret_cast<int64_t>(
_syscall2(MISC_CPUWAKEUPCORE,
reinterpret_cast<void*>(pir),
reinterpret_cast<void*>(i_threads)));
}


void cpu_crit_assert(uint64_t i_failAddr)
{
Expand Down
17 changes: 16 additions & 1 deletion src/usr/isteps/istep16/call_host_activate_slave_cores.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2015,2018 */
/* Contributors Listed Below - COPYRIGHT 2015,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -122,6 +122,18 @@ void* call_host_activate_slave_cores (void *io_pArgs)

int rc = cpu_start_core(pir, en_threads);

// Workaround to handle some syncing issues with new cpus
// waking
if (-ETIME == rc)
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Time out rc from kernel %d on core 0x%x, resending doorbell",
rc,
pir);
rc = cpu_wakeup_core(pir,en_threads);
}

// Handle time out error
uint32_t l_checkidle_eid = 0;
if (-ETIME == rc)
Expand Down Expand Up @@ -208,6 +220,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Throw printk in there too in case it is a kernel issue
ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);

// Add interesting ISTEP traces
l_errl->collectTrace(ISTEP_COMP_NAME,256);

l_stepError.addErrorDetails( l_errl );
errlCommit( l_errl, HWPF_COMP_ID );
break;
Expand Down

0 comments on commit dad96d1

Please sign in to comment.