Skip to content

Commit

Permalink
HTMGT: Config data changes for GPU support
Browse files Browse the repository at this point in the history
Change-Id: I2b4a5a82791ee6c4531d102dad51389f9dedbe6c
RTC: 133828
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45480
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Sheldon R. Bailey <baileysh@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
cjcain authored and dcrowell77 committed Sep 7, 2017
1 parent a644d89 commit 87ff275
Show file tree
Hide file tree
Showing 10 changed files with 352 additions and 9 deletions.
163 changes: 162 additions & 1 deletion src/usr/htmgt/htmgt_cfgdata.C
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,12 @@ namespace HTMGT
cmdDataLen );
break;

case OCC_CFGDATA_GPU_CONFIG:
getGPUConfigMessageData(occ->getTarget(),
cmdData,
cmdDataLen);
break;

default:
TMGT_ERR("sendOccConfigData: Unsupported"
" format type 0x%02X",
Expand Down Expand Up @@ -932,7 +938,8 @@ void getThermalControlMessageData(uint8_t* o_data,
l_numSets++;

// VRM
l_timeout = l_sys->getAttr<ATTR_OPEN_POWER_VRM_READ_TIMEOUT_SEC>();
if (!l_sys->tryGetAttr<ATTR_OPEN_POWER_VRM_READ_TIMEOUT_SEC>(l_timeout))
l_timeout = 0;
if (l_timeout != 0)
{
o_data[index++] = CFGDATA_FRU_TYPE_VRM;
Expand All @@ -944,6 +951,51 @@ void getThermalControlMessageData(uint8_t* o_data,
l_numSets++;
}

// GPU Cores
if (!l_sys->tryGetAttr<ATTR_OPEN_POWER_GPU_READ_TIMEOUT_SEC>(l_timeout))
l_timeout = 0xFF;
if (l_timeout == 0)
{
l_timeout = 0xFF;
}
if (!l_sys->
tryGetAttr<ATTR_OPEN_POWER_GPU_ERROR_TEMP_DEG_C>(l_ERR_temp))
l_ERR_temp = OCC_NOT_DEFINED;
if (l_ERR_temp == 0)
{
l_ERR_temp = OCC_NOT_DEFINED;
}
o_data[index++] = CFGDATA_FRU_TYPE_GPU_CORE;
o_data[index++] = OCC_NOT_DEFINED; //DVFS
o_data[index++] = l_ERR_temp; //ERROR
o_data[index++] = OCC_NOT_DEFINED; //PM_DVFS
o_data[index++] = OCC_NOT_DEFINED; //PM_ERROR
o_data[index++] = l_timeout;
l_numSets++;

// GPU Memory
if (!l_sys->
tryGetAttr<ATTR_OPEN_POWER_GPU_MEM_READ_TIMEOUT_SEC>(l_timeout))
l_timeout = 0xFF;
if (l_timeout == 0)
{
l_timeout = 0xFF;
}
if (!l_sys->
tryGetAttr<ATTR_OPEN_POWER_GPU_MEM_ERROR_TEMP_DEG_C>(l_ERR_temp))
l_ERR_temp = OCC_NOT_DEFINED;
if (l_ERR_temp == 0)
{
l_ERR_temp = OCC_NOT_DEFINED;
}
o_data[index++] = CFGDATA_FRU_TYPE_GPU_MEMORY;
o_data[index++] = OCC_NOT_DEFINED; //DVFS
o_data[index++] = l_ERR_temp; //ERROR
o_data[index++] = OCC_NOT_DEFINED; //PM_DVFS
o_data[index++] = OCC_NOT_DEFINED; //PM_ERROR
o_data[index++] = l_timeout;
l_numSets++;

o_data[l_numSetsOffset] = l_numSets;
o_size = index;

Expand Down Expand Up @@ -975,9 +1027,118 @@ void getAVSBusConfigMessageData( const TargetHandle_t i_occ,
o_data[index++] = 0xFF; //reserved
o_data[index++] = 0xFF; //reserved
o_size = index;

}


// Send config data required by OCC for GPU handling.
// The OCC will determine which GPUs are present from the APSS GPIOs.
//
// Layout written to o_data:
//   1 byte  - format (OCC_CFGDATA_GPU_CONFIG)
//   1 byte  - GPU config version (0x01)
//   2 bytes - total non-GPU max power (W)
//   2 bytes - total proc/mem power drop (W)
//   2 bytes - reserved (0)
//   then, for each of the MAX_GPUS GPUs, three 4-byte sensor IDs:
//             temperature, memory temperature, functional
//
// @param[in]  i_occ  - the OCC target; sensors are looked up on its parent chip
// @param[out] o_data - preallocated buffer to fill in (must not be null)
// @param[out] o_size - set to the number of bytes written to o_data
void getGPUConfigMessageData(const TargetHandle_t i_occ,
                             uint8_t * o_data,
                             uint64_t & o_size)
{
    unsigned int index = 0;
    assert(o_data != nullptr);

    // Get system and proc target
    Target* sys = nullptr;
    targetService().getTopLevelTarget(sys);
    assert(sys != nullptr);
    ConstTargetHandle_t proc = getParentChip(i_occ);
    assert(proc != nullptr);

    // Populate the data
    o_data[index++] = OCC_CFGDATA_GPU_CONFIG;
    o_data[index++] = 0x01; // GPU Config Version

    uint16_t power = 0;
    power = sys->getAttr<ATTR_CALCULATED_MAX_SYS_POWER_EXCLUDING_GPUS>();
    UINT16_PUT(&o_data[index], power); // Total non-GPU max power (W)
    index += 2;

    power = sys->getAttr<ATTR_CALCULATED_PROC_MEMORY_POWER_DROP>();
    UINT16_PUT(&o_data[index], power); // Total proc/mem power drop (W)
    index += 2;
    o_data[index++] = 0; // reserved
    o_data[index++] = 0;

    uint32_t gpu_func_sensors[MAX_GPUS] = {0};
    uint32_t gpu_temp_sensors[MAX_GPUS] = {0};
    uint32_t gpu_memtemp_sensors[MAX_GPUS] = {0};

    // Read GPU sensor numbers.  A failed lookup is logged and committed, and
    // that sensor array is zeroed so the message is still built.
    uint8_t num_sensors = 0;
    errlHndl_t err = nullptr;
    err = SENSOR::getGpuSensors(const_cast<TARGETING::TargetHandle_t>(proc),
                                HWAS::GPU_FUNC_SENSOR,
                                num_sensors, gpu_func_sensors);
    if (err)
    {
        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_FUNC_SENSOR)"
                 " failed with rc 0x%04X", err->reasonCode());
        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
        memset(gpu_func_sensors, 0, sizeof(gpu_func_sensors));
    }
    err = SENSOR::getGpuSensors(const_cast<TARGETING::TargetHandle_t>(proc),
                                HWAS::GPU_TEMPERATURE_SENSOR,
                                num_sensors, gpu_temp_sensors);
    if (err)
    {
        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_TEMP_SENSOR)"
                 " failed with rc 0x%04X", err->reasonCode());
        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
        memset(gpu_temp_sensors, 0, sizeof(gpu_temp_sensors));
    }
    err = SENSOR::getGpuSensors(const_cast<TARGETING::TargetHandle_t>(proc),
                                HWAS::GPU_MEMORY_TEMP_SENSOR,
                                num_sensors, gpu_memtemp_sensors);
    if (err)
    {
        TMGT_ERR("getGPUConfigMessageData: getGpuSensors(GPU_MEM_TEMP_SENSOR)"
                 " failed with rc 0x%04X", err->reasonCode());
        ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
        memset(gpu_memtemp_sensors, 0, sizeof(gpu_memtemp_sensors));
    }

    // Emit one (temp, mem temp, func) sensor triple per GPU.  The loop
    // counter is deliberately NOT named "index" so it cannot shadow the
    // buffer write cursor above.
    for (unsigned int gpu = 0; gpu < MAX_GPUS; ++gpu)
    {
        // Sensors reported as invalid are sent as 0
        if (gpu_func_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
        {
            gpu_func_sensors[gpu] = 0;
        }
        if (gpu_temp_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
        {
            gpu_temp_sensors[gpu] = 0;
        }
        if (gpu_memtemp_sensors[gpu] == TARGETING::UTIL::INVALID_IPMI_SENSOR)
        {
            gpu_memtemp_sensors[gpu] = 0;
        }

        UINT32_PUT(&o_data[index], gpu_temp_sensors[gpu]);
        index += 4;
        UINT32_PUT(&o_data[index], gpu_memtemp_sensors[gpu]);
        index += 4;
        UINT32_PUT(&o_data[index], gpu_func_sensors[gpu]);
        index += 4;
    }

    o_size = index;

} // end getGPUConfigMessageData()



void getFrequencyPointMessageData(uint8_t* o_data,
uint64_t & o_size)
Expand Down
25 changes: 21 additions & 4 deletions src/usr/htmgt/htmgt_cfgdata.H
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <targeting/common/target.H>
#include "htmgt_occ.H"

#define MAX_GPUS 3

namespace HTMGT
{
Expand All @@ -53,6 +54,7 @@ namespace HTMGT
OCC_CFGDATA_MEM_THROTTLE = 0x12, // Memory Throttle Settings
OCC_CFGDATA_TCT_CONFIG = 0x13, // Thermal Control Treshold
OCC_CFGDATA_AVSBUS_CONFIG = 0x14, // AVSBus Config
OCC_CFGDATA_GPU_CONFIG = 0x15, // GPU Config

OCC_CFGDATA_FORMAT_END, // Marker to indicate last entry
OCC_CFGDATA_CLEAR_ALL = 0xFF, // Clear All Active Config Data
Expand All @@ -68,10 +70,12 @@ namespace HTMGT

CFGDATA_CORES = 24,

CFGDATA_FRU_TYPE_PROC = 0x00,
CFGDATA_FRU_TYPE_MEMBUF = 0x01,
CFGDATA_FRU_TYPE_DIMM = 0x02,
CFGDATA_FRU_TYPE_VRM = 0x03,
CFGDATA_FRU_TYPE_PROC = 0x00,
CFGDATA_FRU_TYPE_MEMBUF = 0x01,
CFGDATA_FRU_TYPE_DIMM = 0x02,
CFGDATA_FRU_TYPE_VRM = 0x03,
CFGDATA_FRU_TYPE_GPU_CORE = 0x04,
CFGDATA_FRU_TYPE_GPU_MEMORY = 0x05,

CFDATA_DVFS_NOT_DEFINED = 0xFF,
};
Expand Down Expand Up @@ -116,6 +120,7 @@ namespace HTMGT
{ OCC_CFGDATA_MEM_THROTTLE, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
{ OCC_CFGDATA_TCT_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
{ OCC_CFGDATA_AVSBUS_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
{ OCC_CFGDATA_GPU_CONFIG, TARGET_ALL, TO_20SEC, CFGSTATE_ALL },
};
const size_t OCC_CONFIG_TABLE_SIZE = sizeof(occCfgDataTable) /
sizeof(occCfgDataTable_t);
Expand Down Expand Up @@ -226,6 +231,18 @@ namespace HTMGT
void getAVSBusConfigMessageData(const TARGETING::TargetHandle_t i_occ,
uint8_t* o_data, uint64_t & o_size);

/**
* Fills in the GPU Configuration Data message buffer
*
* The message starts with the format byte (OCC_CFGDATA_GPU_CONFIG) and a
* version byte, followed by two 16-bit power values, two reserved bytes and,
* for each GPU, the temperature, memory temperature and functional sensor
* IDs (4 bytes each).
*
* @param[in] i_occ - the OCC target (its parent chip is used to look up
*                    the GPU sensors)
* @param[out] o_data - preallocated buffer to fill in (must not be null)
* @param[out] o_size - set to the message size (number of bytes written)
* @pre o_data is large enough (44 bytes are written for 3 GPUs).
*/
void getGPUConfigMessageData(const TARGETING::TargetHandle_t i_occ,
uint8_t * o_data,
uint64_t & o_size);

/**
* Fill in the Frequency Point Configuration Data
* message buffer.
Expand Down
34 changes: 32 additions & 2 deletions src/usr/htmgt/htmgt_occ.C
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ namespace HTMGT
iv_target(i_target),
iv_lastPollValid(false),
iv_occsPresent(1 << i_instance),
iv_gpuCfg(0),
iv_resetReason(OCC_RESET_REASON_NONE),
iv_exceptionLogged(0),
iv_resetCount(0),
Expand Down Expand Up @@ -363,6 +364,31 @@ namespace HTMGT
OCC_TRACE_INF );
}

// Notify HostBoot which GPUs are present (after OCC goes active)
void Occ::updateGpuPresence()
{
TARGETING::ConstTargetHandle_t const_proc_target =
TARGETING::getParentChip(iv_target);
SENSOR::StatusSensor::statusEnum gpu_status[MAX_GPUS] =
{
SENSOR::StatusSensor::NOT_PRESENT,
SENSOR::StatusSensor::NOT_PRESENT,
SENSOR::StatusSensor::NOT_PRESENT
};
if (iv_gpuCfg & GPUCFG_GPU0_PRESENT)
gpu_status[0] = SENSOR::StatusSensor::PRESENT;
if (iv_gpuCfg & GPUCFG_GPU1_PRESENT)
gpu_status[1] = SENSOR::StatusSensor::PRESENT;
if (iv_gpuCfg & GPUCFG_GPU2_PRESENT)
gpu_status[2] = SENSOR::StatusSensor::PRESENT;

TMGT_INF("updateGpuPresence: OCC%d - GPU0:%d, GPU1:%d, GPU2:%d",
iv_instance, gpu_status[0], gpu_status[1], gpu_status[2]);
SENSOR::updateGpuSensorStatus(const_cast<TARGETING::TargetHandle_t>
(const_proc_target),
gpu_status);
}


/////////////////////////////////////////////////////////////////

Expand Down Expand Up @@ -752,7 +778,12 @@ namespace HTMGT
// Make sure all OCCs went to active state
for( const auto & occ : iv_occArray )
{
if (requestedState != occ->getState())
if (requestedState == occ->getState())
{
// Update GPU present status
occ->updateGpuPresence();
}
else
{
TMGT_ERR("_setOccState: OCC%d is not in 0x%02X "
"state",
Expand Down Expand Up @@ -797,7 +828,6 @@ namespace HTMGT
"CHARACTERIZATION state");
}
}

}
}
}
Expand Down
7 changes: 7 additions & 0 deletions src/usr/htmgt/htmgt_occ.H
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,11 @@ namespace HTMGT
const occErrlCallout_t i_callout,
uint8_t & io_callout_num);

/**
* @brief Update the GPU presence sensors in the system
*
* Marks each GPU as PRESENT or NOT_PRESENT based on the GPU config
* bits last saved from this OCC's poll response (iv_gpuCfg) and passes
* the result to SENSOR::updateGpuSensorStatus() for this OCC's parent
* chip.
*/
void updateGpuPresence();

protected:
// Instance number of this OCC: 0 = first physical OCC
uint8_t iv_instance;
Expand Down Expand Up @@ -366,6 +371,8 @@ namespace HTMGT
bool iv_lastPollValid;
// expected occsPresent byte in POLL response
uint8_t iv_occsPresent;
// GPU configuration from poll response data
uint8_t iv_gpuCfg;

occResetReason iv_resetReason;

Expand Down
8 changes: 8 additions & 0 deletions src/usr/htmgt/htmgt_poll.C
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,14 @@ namespace HTMGT
iv_instance, state_string(iv_state));
}

// Check GPU config
if (iv_gpuCfg != pollRsp->gpuCfg)
{
iv_gpuCfg = pollRsp->gpuCfg;
TMGT_INF("pollRspHandler: updating OCC%d GPU config to 0x%02X",
iv_instance, iv_gpuCfg);
}

// Copy rspData to lastPollResponse
memcpy(iv_lastPollResponse, pollRsp, OCC_POLL_DATA_MIN_SIZE);
iv_lastPollValid = true;
Expand Down
11 changes: 10 additions & 1 deletion src/usr/htmgt/htmgt_poll.H
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ namespace HTMGT
const uint8_t OCC_XSTATUS_MEM_THROT_OT = 0x20;
const uint8_t OCC_XSTATUS_N_POWER = 0x10;

// GPU Config bits: one presence flag per GPU, GPU0 in the least
// significant bit
enum gpuConfig_e
{
    GPUCFG_GPU0_PRESENT = (1 << 0),
    GPUCFG_GPU1_PRESENT = (1 << 1),
    GPUCFG_GPU2_PRESENT = (1 << 2)
};

struct occPollRspStruct_t
{
uint8_t status;
Expand All @@ -59,7 +67,8 @@ namespace HTMGT
uint8_t errorId;
uint32_t errorAddress;
uint16_t errorLength;
uint16_t reserved[2];
uint8_t reserved;
uint8_t gpuCfg;
uint8_t codeLevel[16];
uint8_t sensor[6];
uint8_t numBlocks;
Expand Down
7 changes: 7 additions & 0 deletions src/usr/htmgt/occError.C
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,13 @@ namespace HTMGT
l_success = false;
}
}
else if (i_callout.type == OCC_CALLOUT_TYPE_GPU_SENSOR)
{
const uint32_t sensor = (uint32_t)i_callout.calloutValue;
io_errlHndl->addSensorCallout(sensor, HWAS::GPU_FUNC_SENSOR,
i_priority);
io_callout_num++;
}
else
{
TMGT_ERR("elogAddCallout: Invalid callout type (type=%d)",
Expand Down

0 comments on commit 87ff275

Please sign in to comment.