Skip to content

Commit

Permalink
Calculate GPU Power Cap
Browse files Browse the repository at this point in the history
Define GPU ID callout type

Change-Id: I99e691abe64fc0d706571fc7a128d565159e0461
RTC: 133823
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45077
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
  • Loading branch information
marthabroyles authored and cjcain committed Aug 29, 2017
1 parent c34f286 commit e1a597e
Show file tree
Hide file tree
Showing 9 changed files with 242 additions and 44 deletions.
51 changes: 32 additions & 19 deletions src/occ_405/amec/amec_master_smh.c
Expand Up @@ -39,6 +39,7 @@
#include "amec_service_codes.h" //For AMEC_MST_CHECK_PCAPS_MATCH
#include "dcom.h"
#include <amec_sensors_power.h>
#include <cmdh_fsp_cmds.h> // For G_apss_ch_to_function

//*************************************************************************/
// Externs
Expand All @@ -55,9 +56,12 @@
//Power cap mismatch threshold set to 8 ticks
#define PCAPS_MISMATCH_THRESHOLD 8

//Power cap failure threshold set to 32 ticks
//Power cap failure threshold with no GPUs set to 32 ticks
#define PCAP_FAILURE_THRESHOLD 32

//Power cap failure threshold with GPUs set to number of ticks for 100ms
#define PCAP_GPU_FAILURE_THRESHOLD (100000 / MICS_PER_TICK)

//*************************************************************************/
// Structures
//*************************************************************************/
Expand Down Expand Up @@ -90,6 +94,8 @@ uint16_t G_mst_soft_fmax = 0xFFFF;
//Counter of committed violations by the Slave OCCs
uint8_t G_mst_violation_cnt[MAX_OCCS] = {0};

extern uint32_t G_first_num_gpus_sys;

// --------------------------------------------------------
// AMEC Master State 5 Substate Table
// --------------------------------------------------------
Expand Down Expand Up @@ -390,6 +396,8 @@ void amec_mst_check_under_pcap(void)
/* Local Variables */
/*------------------------------------------------------------------------*/
errlHndl_t l_err = NULL;
uint8_t i = 0;
uint8_t l_apss_func_id = 0;

/*------------------------------------------------------------------------*/
/* Code */
Expand All @@ -406,21 +414,29 @@ void amec_mst_check_under_pcap(void)

G_over_cap_count++;

//Log error and reset OCC if count >= 32 (ticks)
if(G_over_cap_count >= PCAP_FAILURE_THRESHOLD)
// A power limit takes longer to take effect on GPUs, so when GPUs are present
// use a longer wait time before logging an error and resetting
if( ( (!G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_FAILURE_THRESHOLD) ) ||
( (G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_GPU_FAILURE_THRESHOLD) ) )
{
TRAC_ERR("Failure to maintain power cap: Power Cap = %d ,"
"PWRSYS = %d ,PWRPROC = %d ,PWRFAN = %d ,"
"PWRMEM = %d",g_amec->pcap.active_node_pcap,
AMECSENSOR_PTR(PWRSYS)->sample,
AMECSENSOR_PTR(PWRPROC)->sample,
AMECSENSOR_PTR(PWRFAN)->sample,
AMECSENSOR_PTR(PWRMEM)->sample);

TRAC_ERR("PWRIO = %d , PWRSTORE = %d, PWRGPU = %d",
AMECSENSOR_PTR(PWRIO)->sample,
AMECSENSOR_PTR(PWRSTORE)->sample,
AMECSENSOR_PTR(PWRGPU)->sample);
"PWRSYS = %d",g_amec->pcap.active_node_pcap,
AMECSENSOR_PTR(PWRSYS)->sample);

// Trace power per APSS channel to have the best breakdown for debug
for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++)
{
l_apss_func_id = G_apss_ch_to_function[i];

if((l_apss_func_id != ADC_RESERVED) &&
(l_apss_func_id != ADC_12V_SENSE) &&
(l_apss_func_id != ADC_GND_REMOTE_SENSE) &&
(l_apss_func_id != ADC_12V_STANDBY_CURRENT) )
{
TRAC_ERR("APSS channel %d Function ID = %d Power = %dW", i, l_apss_func_id,
AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample);
}
}

/* @
* @errortype
Expand Down Expand Up @@ -458,11 +474,8 @@ void amec_mst_check_under_pcap(void)
}
else
{
//Decrement count if node power under power cap value
if(G_over_cap_count > 0)
{
G_over_cap_count--;
}
// Clear counter
G_over_cap_count = 0;
}

return;
Expand Down
176 changes: 175 additions & 1 deletion src/occ_405/amec/amec_pcap.c
Expand Up @@ -58,7 +58,6 @@ extern PWR_READING_TYPE G_pwr_reading_type;
//ppb_fmax
#define PDROP_THRESH 0
//Number of MHz to raise the proc_pcap_vote for every watt of available power
//(DCM value should be less than SCM)
#define PROC_MHZ_PER_WATT 28
//Number of MHz to raise ppb_fmax per watt of available power. Depends on
//number of procs in node.
Expand All @@ -75,6 +74,8 @@ uint32_t G_mhz_per_pstate=0;
uint8_t G_over_pcap_count=0;

extern uint16_t G_proc_fmax_mhz; // max(turbo,uturbo) frequencies
extern uint32_t G_first_proc_gpu_config;
extern uint32_t G_first_num_gpus_sys;

//*************************************************************************/
// Function Prototypes
Expand All @@ -84,6 +85,170 @@ extern uint16_t G_proc_fmax_mhz; // max(turbo,uturbo) frequencies
// Functions
//*************************************************************************/

//////////////////////////
// Function Specification
//
// Name: amec_gpu_pcap
//
// Description: Determine power cap for GPUs
//
//   i_active_pcap_changed - TRUE when the caller has just changed the active
//                           node power cap; forces the total/per-GPU power cap
//                           to be recalculated.
//   i_avail_power         - Active node power cap minus measured node power
//                           as computed by the caller; a negative value means
//                           the node is over its cap.
//
//   The calculated cap is stored per GPU in gpu_desired_pcap_mw; this function
//   does not send anything to the GPUs itself.
//
// Thread: Real Time Loop
//
// End Function Specification
void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    uint8_t  i = 0;
    uint32_t l_gpu_cap_mw = 0;
    static uint16_t L_total_gpu_pcap = 0;            // Current total GPU pcap in effect
    static uint16_t L_n_mode_gpu_total_pcap = 0;     // Total GPU pcap required for oversubscription
    static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR
    static uint16_t L_per_gpu_pcap = 0;              // Amount of L_total_gpu_pcap for each GPU
    static uint8_t  L_psr = 100;                     // PSR value used in L_active_psr_gpu_total_pcap calculation
    static bool     L_first_run = TRUE;              // for calculations done only 1 time

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/
    // If this is the first time running calculate the total GPU power cap for oversubscription
    if(L_first_run)
    {
        if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
        {
            // Take all non-GPU power away from the oversubscription power cap
            L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
            // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs
            L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts;
        }
        else
        {
            // This should not happen, the total non GPU power should never be higher than the N mode cap
            // Log error and set GPUs to minimum power cap
            L_n_mode_gpu_total_pcap = 0;  // this will set minimum GPU power cap

            TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW",
                     G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap);

            /* @
             * @errortype
             * @moduleid    AMEC_GPU_PCAP_MID
             * @reasoncode  GPU_FAILURE
             * @userdata1   N mode Power Cap watts
             * @userdata2   Total non-GPU power watts
             * @userdata4   ERC_GPU_N_MODE_PCAP_CALC_FAILURE
             * @devdesc     Total non-GPU power more than N mode power cap
             *
             */
            errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID,
                                          GPU_FAILURE,
                                          ERC_GPU_N_MODE_PCAP_CALC_FAILURE,
                                          ERRL_SEV_PREDICTIVE,
                                          NULL,
                                          DEFAULT_TRACE_SIZE,
                                          g_amec->pcap.ovs_node_pcap,
                                          G_sysConfigData.total_non_gpu_max_pwr_watts);

            //Callout firmware
            addCalloutToErrl(l_err,
                             ERRL_CALLOUT_TYPE_COMPONENT_ID,
                             ERRL_COMPONENT_ID_FIRMWARE,
                             ERRL_CALLOUT_PRIORITY_HIGH);
            commitErrl(&l_err);
        }
    }  // if first run

    // Calculate the total GPU power cap for the current active limit and PSR
    // this only needs to be calculated if either the active limit or PSR changed
    if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) )
    {
        L_psr = G_sysConfigData.psr;
        if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
        {
            // Take all non-GPU power away from the active power cap
            L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
            // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR
            // to give to GPUs.  The PSR is applied as a percentage: multiply BEFORE dividing by 100,
            // otherwise the integer division (L_psr / 100) truncates to 0 for every PSR below 100
            // and none of the dropped power is ever given to the GPUs.
            L_active_psr_gpu_total_pcap +=
                ( ((uint32_t)L_psr * G_sysConfigData.total_proc_mem_pwr_drop_watts) / 100 );
        }
        else
        {
            // Set GPUs to minimum power cap
            L_active_psr_gpu_total_pcap = 0;
            TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW",
                     G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap);
        }

        // Total GPU power cap is the lower of oversubscription and active power limit
        // must always account for oversubscription to ensure when a power supply is lost the OCC
        // can react fast enough, GPU power capping is too slow and must have GPU power cap already
        // set to account for oversubscription case
        L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ?
                            L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap;

        // Divide the total equally across all GPUs in the system
        if(G_first_num_gpus_sys)
        {
            L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys;
        }
        else
        {
            // Avoid dividing by zero; a per-GPU cap of 0 gets clipped up to each GPU's minimum below
            L_per_gpu_pcap = 0;
            TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!");
        }
    }

    // Setup to send new power limit to GPUs. The actual sending of GPU power limit will be handled by task_gpu_sm()
    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        // Before sending a GPU a power limit the power limits must be read from the GPU to know min/max GPU allows
        if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read )
        {
            l_gpu_cap_mw = L_per_gpu_pcap * 1000;  // convert W to mW

            // GPU is present and have min/max power limits from GPU
            // clip the GPU power limit to min/max GPU limit if needed
            if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw)  // clip to min?
            {
                l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw;
            }
            else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw)  // clip to max?
            {
                l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw;
            }

            // If not already at the min then set to min if trying to reduce power and proc/memory are at min
            if( (i_avail_power < 0) && (g_amec->proc[0].pwr_votes.ppb_fmax == g_amec->sys.fmin) &&
                (g_amec->pcap.active_mem_level) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) )
            {
                l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw;
                // Only trace and flag the transition, not every pass while already held at minimum
                if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw)
                {
                    TRAC_ERR("amec_gpu_pcap: Forcing GPU%d to minimum pwr limit %dmW", i, l_gpu_cap_mw);
                    g_amec->gpu[i].pcap.gpu_min_cap_required = TRUE;
                }
            }

            // check if this is a new power limit
            if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw)
            {
                TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i,
                         g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw);
                g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw;

                // Moving off the minimum clears the "forced to minimum" flag set above
                if( (g_amec->gpu[i].pcap.gpu_min_cap_required) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) )
                {
                    TRAC_ERR("amec_gpu_pcap: GPU%d no longer requires minimum pwr limit %dmW", i, g_amec->gpu[i].pcap.gpu_min_pcap_mw);
                    g_amec->gpu[i].pcap.gpu_min_cap_required = FALSE;
                }
            }
        }
    }  // for each GPU

    L_first_run = FALSE;
}


//////////////////////////
// Function Specification
Expand All @@ -101,6 +266,7 @@ void amec_pcap_calc(void)
/* Local Variables */
/*------------------------------------------------------------------------*/
bool l_oversub_state = 0;
bool l_active_pcap_changed = FALSE;
uint16_t l_node_pwr = AMECSENSOR_PTR(PWRSYS)->sample;
uint16_t l_p0_pwr = AMECSENSOR_PTR(PWRPROC)->sample;
int32_t l_avail_power = 0;
Expand Down Expand Up @@ -138,9 +304,17 @@ void amec_pcap_calc(void)

// set this pcap as valid (needed by master for comparison)
g_amec->pcap_valid = 1;
l_active_pcap_changed = TRUE;
}

l_avail_power = g_amec->pcap.active_node_pcap - l_node_pwr;

// Determine GPU power cap if there are GPUs present
if(G_first_proc_gpu_config)
{
amec_gpu_pcap(l_active_pcap_changed, l_avail_power);
}

if(l_node_pwr != 0)
{
l_proc_fraction = ((uint32_t)(l_p0_pwr) << 16)/l_node_pwr;
Expand Down
1 change: 1 addition & 0 deletions src/occ_405/amec/amec_service_codes.h
Expand Up @@ -66,6 +66,7 @@ enum occAmecModuleId
AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16,
AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17,
AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18,
AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19,
};

/*----------------------------------------------------------------------------*/
Expand Down
1 change: 1 addition & 0 deletions src/occ_405/amec/amec_sys.h
Expand Up @@ -449,6 +449,7 @@ typedef struct {
typedef struct {
bool check_pwr_limit; // Indicates if need to read power limits from GPU
bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max
bool gpu_min_cap_required; // Indicates if power limits were read i.e. have min/max
uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set
Expand Down
33 changes: 17 additions & 16 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Expand Up @@ -52,6 +52,7 @@
#include "sensor_main_memory.h"
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;
extern uint32_t G_first_proc_gpu_config;

#include <gpe_export.h>
extern gpe_shared_data_t G_shared_gpe_data;
Expand Down Expand Up @@ -223,7 +224,11 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
l_poll_rsp->errl_id, l_poll_rsp->errl_length, l_poll_rsp->errl_address);
}

// Byte 15 - 16: reserved.
// Byte 15: reserved.

// Byte 16: GPU Configuration
l_poll_rsp->gpu_presence = (uint8_t)G_first_proc_gpu_config;

// Byte 17 - 32 (16 bytes): OCC level
memcpy( (void *) l_poll_rsp->occ_level, (void *) &G_occ_buildname[0], 16);

Expand Down Expand Up @@ -1165,21 +1170,17 @@ void cmdh_dbug_get_apss_data (const cmdh_fsp_cmd_t * i_cmd_ptr,
//Get the data for each channel individually and write it to
for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++)
{

if(AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid != 0)
{
l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain;
l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset;
l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i];
l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample;
l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i];
l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid;

TRAC_IMP("DBG__APSS Ch[%02d]: Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],",
i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain);
TRAC_IMP(" Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]",
l_resp_ptr->ApssCh[i].calculated, l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid);
}
l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain;
l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset;
l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i];
l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample;
l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i];
l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid;

TRAC_IMP("DBG__APSS Ch[%02d]: Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],",
i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain);
TRAC_IMP(" Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]",
l_resp_ptr->ApssCh[i].calculated, l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid);
}

}while(0);
Expand Down
5 changes: 3 additions & 2 deletions src/occ_405/cmdh/cmdh_fsp_cmds.h
Expand Up @@ -157,9 +157,10 @@ typedef struct __attribute__ ((packed)) cmdh_poll_resp_v20
uint32_t errl_address;
// BYTES 13 - 14: Error Log Length
uint16_t errl_length;
// BYTES 15 - 16: Reserved
// BYTE 15: Reserved
uint8_t _reserved_15;
uint8_t _reserved_16;
// BYTE 16: GPU Configuration
uint8_t gpu_presence;
// BYTES 17 - 32 (16 bytes): OCC Code Level - ASCII string of OCC build level currently running.
uint8_t occ_level[16];
// BYTES 33 - 38 (6 bytes): ASCII eye catcher "SENSOR"
Expand Down

0 comments on commit e1a597e

Please sign in to comment.