From e1a597e9f5bc8e7b193058ca32a8c8ba46ebf519 Mon Sep 17 00:00:00 2001 From: mbroyles Date: Wed, 23 Aug 2017 16:53:05 -0500 Subject: [PATCH] Calculate GPU Power Cap Define GPU ID callout type Change-Id: I99e691abe64fc0d706571fc7a128d565159e0461 RTC: 133823 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45077 Tested-by: FSP CI Jenkins Reviewed-by: William A. Bryan Reviewed-by: Christopher J. Cain --- src/occ_405/amec/amec_master_smh.c | 51 +++++--- src/occ_405/amec/amec_pcap.c | 176 +++++++++++++++++++++++++- src/occ_405/amec/amec_service_codes.h | 1 + src/occ_405/amec/amec_sys.h | 1 + src/occ_405/cmdh/cmdh_fsp_cmds.c | 33 ++--- src/occ_405/cmdh/cmdh_fsp_cmds.h | 5 +- src/occ_405/errl/errl.h | 1 + src/occ_405/gpu/gpu.c | 17 ++- src/occ_405/occ_service_codes.h | 1 + 9 files changed, 242 insertions(+), 44 deletions(-) diff --git a/src/occ_405/amec/amec_master_smh.c b/src/occ_405/amec/amec_master_smh.c index 0fcee201..ce847495 100755 --- a/src/occ_405/amec/amec_master_smh.c +++ b/src/occ_405/amec/amec_master_smh.c @@ -39,6 +39,7 @@ #include "amec_service_codes.h" //For AMEC_MST_CHECK_PCAPS_MATCH #include "dcom.h" #include +#include // For G_apss_ch_to_function //*************************************************************************/ // Externs @@ -55,9 +56,12 @@ //Power cap mismatch threshold set to 8 ticks #define PCAPS_MISMATCH_THRESHOLD 8 -//Power cap failure threshold set to 32 ticks +//Power cap failure threshold with no GPUs set to 32 ticks #define PCAP_FAILURE_THRESHOLD 32 +//Power cap failure threshold with GPUs set to number of ticks for 100ms +#define PCAP_GPU_FAILURE_THRESHOLD (100000 / MICS_PER_TICK) + //*************************************************************************/ // Structures //*************************************************************************/ @@ -90,6 +94,8 @@ uint16_t G_mst_soft_fmax = 0xFFFF; //Counter of committed violations by the Slave OCCs uint8_t G_mst_violation_cnt[MAX_OCCS] = {0}; +extern uint32_t 
G_first_num_gpus_sys; + // -------------------------------------------------------- // AMEC Master State 5 Substate Table // -------------------------------------------------------- @@ -390,6 +396,8 @@ void amec_mst_check_under_pcap(void) /* Local Variables */ /*------------------------------------------------------------------------*/ errlHndl_t l_err = NULL; + uint8_t i = 0; + uint8_t l_apss_func_id = 0; /*------------------------------------------------------------------------*/ /* Code */ @@ -406,21 +414,29 @@ void amec_mst_check_under_pcap(void) G_over_cap_count++; - //Log error and reset OCC if count >= 32 (ticks) - if(G_over_cap_count >= PCAP_FAILURE_THRESHOLD) + // GPU power limits take longer to take effect, so when GPUs are present use + // a longer wait time before logging an error and resetting + if( ( (!G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_FAILURE_THRESHOLD) ) || + ( (G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_GPU_FAILURE_THRESHOLD) ) ) { TRAC_ERR("Failure to maintain power cap: Power Cap = %d ," - "PWRSYS = %d ,PWRPROC = %d ,PWRFAN = %d ," - "PWRMEM = %d",g_amec->pcap.active_node_pcap, - AMECSENSOR_PTR(PWRSYS)->sample, - AMECSENSOR_PTR(PWRPROC)->sample, - AMECSENSOR_PTR(PWRFAN)->sample, - AMECSENSOR_PTR(PWRMEM)->sample); - - TRAC_ERR("PWRIO = %d , PWRSTORE = %d, PWRGPU = %d", - AMECSENSOR_PTR(PWRIO)->sample, - AMECSENSOR_PTR(PWRSTORE)->sample, - AMECSENSOR_PTR(PWRGPU)->sample); + "PWRSYS = %d",g_amec->pcap.active_node_pcap, + AMECSENSOR_PTR(PWRSYS)->sample); + + // Trace power per APSS channel to have the best breakdown for debug + for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++) + { + l_apss_func_id = G_apss_ch_to_function[i]; + + if((l_apss_func_id != ADC_RESERVED) && + (l_apss_func_id != ADC_12V_SENSE) && + (l_apss_func_id != ADC_GND_REMOTE_SENSE) && + (l_apss_func_id != ADC_12V_STANDBY_CURRENT) ) + { + TRAC_ERR("APSS channel %d Function ID = %d Power = %dW", i, l_apss_func_id, + AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample); + } + 
} /* @ * @errortype @@ -458,11 +474,8 @@ void amec_mst_check_under_pcap(void) } else { - //Decrement count if node power under power cap value - if(G_over_cap_count > 0) - { - G_over_cap_count--; - } + // Clear counter + G_over_cap_count = 0; } return; diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c index 286921d2..7584ddfd 100755 --- a/src/occ_405/amec/amec_pcap.c +++ b/src/occ_405/amec/amec_pcap.c @@ -58,7 +58,6 @@ extern PWR_READING_TYPE G_pwr_reading_type; //ppb_fmax #define PDROP_THRESH 0 //Number of MHz to raise the proc_pcap_vote for every watt of available power -//(DCM value should be less than SCM) #define PROC_MHZ_PER_WATT 28 //Number of MHz to raise ppb_fmax per watt of available power. Depends on //number of procs in node. @@ -75,6 +74,8 @@ uint32_t G_mhz_per_pstate=0; uint8_t G_over_pcap_count=0; extern uint16_t G_proc_fmax_mhz; // max(turbo,uturbo) frequencies +extern uint32_t G_first_proc_gpu_config; +extern uint32_t G_first_num_gpus_sys; //*************************************************************************/ // Function Prototypes @@ -84,6 +85,170 @@ extern uint16_t G_proc_fmax_mhz; // max(turbo,uturbo) frequencies // Functions //*************************************************************************/ +////////////////////////// +// Function Specification +// +// Name: amec_gpu_pcap +// +// Description: Determine power cap for GPUs: the lower of the N mode (oversubscription) and active/PSR GPU budgets, split equally per GPU and clipped to each GPU's min/max limit +// +// Thread: Real Time Loop +// +// End Function Specification +void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power) +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + uint8_t i = 0; + uint32_t l_gpu_cap_mw = 0; + static uint16_t L_total_gpu_pcap = 0; // Current total GPU pcap in effect + static uint16_t L_n_mode_gpu_total_pcap = 0; // Total GPU pcap required for oversubscription + static uint16_t L_active_psr_gpu_total_pcap = 0; // 
Total GPU pcap for the currently set pcap and PSR + static uint16_t L_per_gpu_pcap = 0; // Amount of L_total_gpu_pcap for each GPU + static uint8_t L_psr = 100; // PSR value used in L_active_psr_gpu_total_pcap calculation + static bool L_first_run = TRUE; // for calculations done only 1 time + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + // If this is the first time running calculate the total GPU power cap for oversubscription + if(L_first_run) + { + if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { + // Take all non-GPU power away from the oversubscription power cap + L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; + // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs + L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts; + } + else + { + // This should not happen, the total non GPU power should never be higher than the N mode cap + // Log error and set GPUs to minimum power cap + L_n_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap + + TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW", + G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap); + + /* @ + * @errortype + * @moduleid AMEC_GPU_PCAP_MID + * @reasoncode GPU_FAILURE + * @userdata1 N mode Power Cap watts + * @userdata2 Total non-GPU power watts + * @userdata4 ERC_GPU_N_MODE_PCAP_CALC_FAILURE + * @devdesc Total non-GPU power more than N mode power cap + * + */ + errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID, + GPU_FAILURE, + ERC_GPU_N_MODE_PCAP_CALC_FAILURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + g_amec->pcap.ovs_node_pcap, + G_sysConfigData.total_non_gpu_max_pwr_watts); + + //Callout firmware + addCalloutToErrl(l_err, + 
ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_FIRMWARE, + ERRL_CALLOUT_PRIORITY_HIGH); + commitErrl(&l_err); + } + } // if first run + + // Calculate the total GPU power cap for the current active limit and PSR + // this only needs to be calculated if either the active limit or PSR changed + if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) ) + { + L_psr = G_sysConfigData.psr; + if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts) + { + // Take all non-GPU power away from the active power cap + L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts; + // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR + // to give to GPUs (multiply before dividing by 100 so integer division does not truncate a PSR below 100 to 0) + L_active_psr_gpu_total_pcap += ( (L_psr * G_sysConfigData.total_proc_mem_pwr_drop_watts) / 100 ); + } + else + { + // Set GPUs to minimum power cap + L_active_psr_gpu_total_pcap = 0; + TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW", + G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap); + } + + // Total GPU power cap is the lower of oversubscription and active power limit + // must always account for oversubscription to ensure when a power supply is lost the OCC + // can react fast enough, GPU power capping is too slow and must have GPU power cap already + // set to account for oversubscription case + L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ? + L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap; + + // Divide the total equally across all GPUs in the system + if(G_first_num_gpus_sys) + { + L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys; + } + else + { + L_per_gpu_pcap = 0; + TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!"); + } + } + + // Setup to send new power limit to GPUs. 
The actual sending of GPU power limit will be handled by task_gpu_sm() + for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++) + { + // Only GPUs that are present and have had min/max power limits read can be capped (NOTE(review): loop bound and presence check restored from garbled patch text - confirm) + if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read ) + { + l_gpu_cap_mw = L_per_gpu_pcap * 1000; // convert W to mW + + // GPU is present and have min/max power limits from GPU + // clip the GPU power limit to min/max GPU limit if needed + if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw) // clip to min? + { + l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw; + } + else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw) // clip to max? + { + l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw; + } + + // If not already at the min then set to min if trying to reduce power and proc/memory are at min + if( (i_avail_power < 0) && (g_amec->proc[0].pwr_votes.ppb_fmax == g_amec->sys.fmin) && + (g_amec->pcap.active_mem_level) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) ) + { + l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw; + if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw) + { + TRAC_ERR("amec_gpu_pcap: Forcing GPU%d to minimum pwr limit %dmW", i, l_gpu_cap_mw); + g_amec->gpu[i].pcap.gpu_min_cap_required = TRUE; + } + } + + // check if this is a new power limit + if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw) + { + TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i, + g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw); + g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw; + + if( (g_amec->gpu[i].pcap.gpu_min_cap_required) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) ) + { + TRAC_ERR("amec_gpu_pcap: GPU%d no longer requires minimum pwr limit %dmW", i, g_amec->gpu[i].pcap.gpu_min_pcap_mw); + g_amec->gpu[i].pcap.gpu_min_cap_required = FALSE; + } + } + } + } // for each GPU + + L_first_run = FALSE; +} + ////////////////////////// // Function Specification @@ -101,6 +266,7 @@ void amec_pcap_calc(void) /* Local Variables */ /*------------------------------------------------------------------------*/ bool l_oversub_state = 0; + 
bool l_active_pcap_changed = FALSE; uint16_t l_node_pwr = AMECSENSOR_PTR(PWRSYS)->sample; uint16_t l_p0_pwr = AMECSENSOR_PTR(PWRPROC)->sample; int32_t l_avail_power = 0; @@ -138,9 +304,17 @@ void amec_pcap_calc(void) // set this pcap as valid (needed by master for comparison) g_amec->pcap_valid = 1; + l_active_pcap_changed = TRUE; } l_avail_power = g_amec->pcap.active_node_pcap - l_node_pwr; + + // Determine GPU power cap if there are GPUs present + if(G_first_proc_gpu_config) + { + amec_gpu_pcap(l_active_pcap_changed, l_avail_power); + } + if(l_node_pwr != 0) { l_proc_fraction = ((uint32_t)(l_p0_pwr) << 16)/l_node_pwr; diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h index 47d6c09c..f206daf0 100755 --- a/src/occ_405/amec/amec_service_codes.h +++ b/src/occ_405/amec/amec_service_codes.h @@ -66,6 +66,7 @@ enum occAmecModuleId AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, + AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19, }; /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index a45fb42f..c084a0cc 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -449,6 +449,7 @@ typedef struct { typedef struct { bool check_pwr_limit; // Indicates if need to read power limits from GPU bool pwr_limits_read; // Indicates if power limits were read i.e. have min/max + bool gpu_min_cap_required; // Indicates GPU is being forced to its minimum power limit; only set after power limits were read i.e. 
have min/max uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU uint32_t gpu_desired_pcap_mw; // AMEC determined pcap in mW to set diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index c6802ee2..6cdf79ef 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -52,6 +52,7 @@ #include "sensor_main_memory.h" extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap; extern bool G_vrm_thermal_monitoring; +extern uint32_t G_first_proc_gpu_config; #include extern gpe_shared_data_t G_shared_gpe_data; @@ -223,7 +224,11 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) l_poll_rsp->errl_id, l_poll_rsp->errl_length, l_poll_rsp->errl_address); } - // Byte 15 - 16: reserved. + // Byte 15: reserved. + + // Byte 16: GPU Configuration + l_poll_rsp->gpu_presence = (uint8_t)G_first_proc_gpu_config; + // Byte 17 - 32 (16 bytes): OCC level memcpy( (void *) l_poll_rsp->occ_level, (void *) &G_occ_buildname[0], 16); @@ -1165,21 +1170,17 @@ void cmdh_dbug_get_apss_data (const cmdh_fsp_cmd_t * i_cmd_ptr, //Get the data for each channel individually and write it to for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++) { - - if(AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid != 0) - { - l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain; - l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset; - l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i]; - l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample; - l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i]; - l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid; - - TRAC_IMP("DBG__APSS Ch[%02d]: Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],", - i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain); - TRAC_IMP(" Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]", - l_resp_ptr->ApssCh[i].calculated, 
l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid); - } + l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain; + l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset; + l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i]; + l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample; + l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i]; + l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid; + + TRAC_IMP("DBG__APSS Ch[%02d]: Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],", + i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain); + TRAC_IMP(" Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]", + l_resp_ptr->ApssCh[i].calculated, l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid); } }while(0); diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h index 3bd0f739..2f3688ff 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.h +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h @@ -157,9 +157,10 @@ typedef struct __attribute__ ((packed)) cmdh_poll_resp_v20 uint32_t errl_address; // BYTES 13 - 14: Error Log Length uint16_t errl_length; - // BYTES 15 - 16: Reserved + // BYTE 15: Reserved uint8_t _reserved_15; - uint8_t _reserved_16; + // BYTE 16: GPU Configuration + uint8_t gpu_presence; // BYTES 17 - 32 (16 bytes): OCC Code Level - ASCII string of OCC build level currently running. 
uint8_t occ_level[16]; // BYTES 33 - 38 (6 bytes): ASCII eye catcher "SENSOR" diff --git a/src/occ_405/errl/errl.h b/src/occ_405/errl/errl.h index 29f7d407..ad56c9ae 100755 --- a/src/occ_405/errl/errl.h +++ b/src/occ_405/errl/errl.h @@ -144,6 +144,7 @@ typedef enum { ERRL_CALLOUT_TYPE_HUID = 0x01, ERRL_CALLOUT_TYPE_COMPONENT_ID = 0x02, + ERRL_CALLOUT_TYPE_GPU_ID = 0x03, } ERRL_CALLOUT_TYPE; /* TMGT-OCC Component Ids */ diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c index 522ea842..052039e3 100755 --- a/src/occ_405/gpu/gpu.c +++ b/src/occ_405/gpu/gpu.c @@ -389,7 +389,7 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg) if(G_sysConfigData.gpu_sensor_ids[gpu_id]) { addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, + ERRL_CALLOUT_TYPE_GPU_ID, G_sysConfigData.gpu_sensor_ids[gpu_id], ERRL_CALLOUT_PRIORITY_MED); } @@ -819,7 +819,7 @@ bool gpu_read_temp_sm() if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) { addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, + ERRL_CALLOUT_TYPE_GPU_ID, G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], ERRL_CALLOUT_PRIORITY_MED); } @@ -916,7 +916,7 @@ bool gpu_read_mem_temp_capability_sm() if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) { addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, + ERRL_CALLOUT_TYPE_GPU_ID, G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], ERRL_CALLOUT_PRIORITY_MED); } @@ -1060,7 +1060,7 @@ bool gpu_read_memory_temp_sm() if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) { addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, + ERRL_CALLOUT_TYPE_GPU_ID, G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], ERRL_CALLOUT_PRIORITY_MED); } @@ -1149,7 +1149,7 @@ bool gpu_read_memory_temp_sm() if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id]) { addCalloutToErrl(l_err, - ERRL_CALLOUT_TYPE_HUID, + ERRL_CALLOUT_TYPE_GPU_ID, G_sysConfigData.gpu_sensor_ids[G_current_gpu_id], ERRL_CALLOUT_PRIORITY_MED); } @@ -1498,6 +1498,11 @@ void task_gpu_sm(struct task *i_self) // to let IDLE SM decide what to do next 
g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE; g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE; + if(g_amec->gpu[G_current_gpu_id].status.driverLoaded) + { + // Driver is loaded, read the power limits so we can start GPU power capping + g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE; + } G_gpu_state = GPU_STATE_IDLE; l_start_next_state = TRUE; } @@ -1505,7 +1510,7 @@ void task_gpu_sm(struct task *i_self) case GPU_STATE_READ_PWR_LIMIT: // Read power limits for current GPU - if(1) // TODO + if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported { // Read power limits complete for this GPU, go to IDLE state // to let IDLE SM decide what to do next diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h index 310e76a3..bf2dbc86 100644 --- a/src/occ_405/occ_service_codes.h +++ b/src/occ_405/occ_service_codes.h @@ -289,6 +289,7 @@ enum occExtReasonCode ERC_GPU_READ_MEM_TEMP_TIMEOUT = 0x00F5, ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE = 0x00F6, ERC_GPU_INVALID_GPU_OPERATION = 0x00F7, + ERC_GPU_N_MODE_PCAP_CALC_FAILURE = 0x00F8, ERC_GPU_NO_GPE_SUPPORT = 0x00FF, ERC_STATE_FROM_ALL_TO_STB_FAILURE = 0x0123,