From e1a597e9f5bc8e7b193058ca32a8c8ba46ebf519 Mon Sep 17 00:00:00 2001
From: mbroyles <mbroyles@us.ibm.com>
Date: Wed, 23 Aug 2017 16:53:05 -0500
Subject: [PATCH] Calculate GPU Power Cap

Define GPU ID callout type

Change-Id: I99e691abe64fc0d706571fc7a128d565159e0461
RTC: 133823
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45077
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
---
 src/occ_405/amec/amec_master_smh.c    |  51 +++++---
 src/occ_405/amec/amec_pcap.c          | 176 +++++++++++++++++++++++++-
 src/occ_405/amec/amec_service_codes.h |   1 +
 src/occ_405/amec/amec_sys.h           |   1 +
 src/occ_405/cmdh/cmdh_fsp_cmds.c      |  33 ++---
 src/occ_405/cmdh/cmdh_fsp_cmds.h      |   5 +-
 src/occ_405/errl/errl.h               |   1 +
 src/occ_405/gpu/gpu.c                 |  17 ++-
 src/occ_405/occ_service_codes.h       |   1 +
 9 files changed, 242 insertions(+), 44 deletions(-)

diff --git a/src/occ_405/amec/amec_master_smh.c b/src/occ_405/amec/amec_master_smh.c
index 0fcee201..ce847495 100755
--- a/src/occ_405/amec/amec_master_smh.c
+++ b/src/occ_405/amec/amec_master_smh.c
@@ -39,6 +39,7 @@
 #include "amec_service_codes.h" //For AMEC_MST_CHECK_PCAPS_MATCH
 #include "dcom.h"
 #include <amec_sensors_power.h>
+#include <cmdh_fsp_cmds.h>      // For G_apss_ch_to_function
 
 //*************************************************************************/
 // Externs
@@ -55,9 +56,12 @@
 //Power cap mismatch threshold set to 8 ticks
 #define PCAPS_MISMATCH_THRESHOLD 8
 
-//Power cap failure threshold set to 32 ticks
+//Power cap failure threshold with no GPUs set to 32 ticks
 #define PCAP_FAILURE_THRESHOLD 32
 
+//Power cap failure threshold with GPUs set to number of ticks for 100ms
+#define PCAP_GPU_FAILURE_THRESHOLD (100000 / MICS_PER_TICK)
+
 //*************************************************************************/
 // Structures
 //*************************************************************************/
@@ -90,6 +94,8 @@ uint16_t G_mst_soft_fmax = 0xFFFF;
 //Counter of committed violations by the Slave OCCs
 uint8_t  G_mst_violation_cnt[MAX_OCCS] = {0};
 
+extern uint32_t G_first_num_gpus_sys;
+
 // --------------------------------------------------------
 // AMEC Master State 5 Substate Table
 // --------------------------------------------------------
@@ -390,6 +396,8 @@ void amec_mst_check_under_pcap(void)
     /*  Local Variables                                                       */
     /*------------------------------------------------------------------------*/
     errlHndl_t l_err = NULL;
+    uint8_t i = 0;
+    uint8_t l_apss_func_id = 0;
 
     /*------------------------------------------------------------------------*/
     /*  Code                                                                  */
@@ -406,21 +414,29 @@ void amec_mst_check_under_pcap(void)
 
         G_over_cap_count++;
 
-        //Log error and reset OCC if count >= 32 (ticks)
-        if(G_over_cap_count >= PCAP_FAILURE_THRESHOLD)
+        // GPUs take longer for a power limit to take effect, so if GPUs are present use
+        // a longer wait time before logging an error and resetting
+        if( ( (!G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_FAILURE_THRESHOLD) ) ||
+            ( (G_first_num_gpus_sys) && (G_over_cap_count >= PCAP_GPU_FAILURE_THRESHOLD) ) )
         {
             TRAC_ERR("Failure to maintain power cap: Power Cap = %d ,"
-                     "PWRSYS = %d ,PWRPROC = %d ,PWRFAN = %d ,"
-                     "PWRMEM = %d",g_amec->pcap.active_node_pcap,
-                     AMECSENSOR_PTR(PWRSYS)->sample,
-                     AMECSENSOR_PTR(PWRPROC)->sample,
-                     AMECSENSOR_PTR(PWRFAN)->sample,
-                     AMECSENSOR_PTR(PWRMEM)->sample);
-
-            TRAC_ERR("PWRIO = %d , PWRSTORE = %d, PWRGPU = %d",
-                     AMECSENSOR_PTR(PWRIO)->sample,
-                     AMECSENSOR_PTR(PWRSTORE)->sample,
-                     AMECSENSOR_PTR(PWRGPU)->sample);
+                     "PWRSYS = %d",g_amec->pcap.active_node_pcap,
+                     AMECSENSOR_PTR(PWRSYS)->sample);
+
+            // Trace power per APSS channel to have the best breakdown for debug
+            for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++)
+            {
+                l_apss_func_id = G_apss_ch_to_function[i];
+
+                if((l_apss_func_id != ADC_RESERVED) &&
+                   (l_apss_func_id != ADC_12V_SENSE) &&
+                   (l_apss_func_id != ADC_GND_REMOTE_SENSE) &&
+                   (l_apss_func_id != ADC_12V_STANDBY_CURRENT) )
+                {
+                    TRAC_ERR("APSS channel %d Function ID = %d Power = %dW", i, l_apss_func_id,
+                              AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample);
+                }
+            }
 
             /* @
              * @errortype
@@ -458,11 +474,8 @@ void amec_mst_check_under_pcap(void)
     }
     else
     {
-        //Decrement count if node power under power cap value
-        if(G_over_cap_count > 0)
-        {
-            G_over_cap_count--;
-        }
+        // Clear counter
+        G_over_cap_count = 0;
     }
 
     return;
diff --git a/src/occ_405/amec/amec_pcap.c b/src/occ_405/amec/amec_pcap.c
index 286921d2..7584ddfd 100755
--- a/src/occ_405/amec/amec_pcap.c
+++ b/src/occ_405/amec/amec_pcap.c
@@ -58,7 +58,6 @@ extern PWR_READING_TYPE  G_pwr_reading_type;
 //ppb_fmax
 #define PDROP_THRESH        0
 //Number of MHz to raise the proc_pcap_vote for every watt of available power
-//(DCM value should be less than SCM)
 #define PROC_MHZ_PER_WATT   28
 //Number of MHz to raise ppb_fmax per watt of available power. Depends on
 //number of procs in node.
@@ -75,6 +74,8 @@ uint32_t    G_mhz_per_pstate=0;
 uint8_t     G_over_pcap_count=0;
 
 extern uint16_t G_proc_fmax_mhz;   // max(turbo,uturbo) frequencies
+extern uint32_t G_first_proc_gpu_config;
+extern uint32_t G_first_num_gpus_sys;
 
 //*************************************************************************/
 // Function Prototypes
@@ -84,6 +85,170 @@ extern uint16_t G_proc_fmax_mhz;   // max(turbo,uturbo) frequencies
 // Functions
 //*************************************************************************/
 
+//////////////////////////
+// Function Specification
+//
+// Name: amec_gpu_pcap
+//
+// Description: Determine power cap for GPUs. i_active_pcap_changed forces the total GPU cap to be
+//              recalculated; a negative i_avail_power may force GPUs to their minimum power limit.
+// Thread: Real Time Loop
+//
+// End Function Specification
+void amec_gpu_pcap(bool i_active_pcap_changed, int32_t i_avail_power)
+{
+    /*------------------------------------------------------------------------*/
+    /*  Local Variables                                                       */
+    /*------------------------------------------------------------------------*/
+    uint8_t  i = 0;
+    uint32_t l_gpu_cap_mw = 0;
+    static uint16_t L_total_gpu_pcap = 0;  // Current total GPU pcap in effect
+    static uint16_t L_n_mode_gpu_total_pcap = 0;  // Total GPU pcap required for oversubscription
+    static uint16_t L_active_psr_gpu_total_pcap = 0; // Total GPU pcap for the currently set pcap and PSR
+    static uint16_t L_per_gpu_pcap = 0;  // Amount of L_total_gpu_pcap for each GPU
+    static uint8_t L_psr = 100;   // PSR value used in L_active_psr_gpu_total_pcap calculation
+    static bool L_first_run = TRUE;  // for calculations done only 1 time
+
+    /*------------------------------------------------------------------------*/
+    /*  Code                                                                  */
+    /*------------------------------------------------------------------------*/
+    // If this is the first time running calculate the total GPU power cap for oversubscription
+    if(L_first_run)
+    {
+       if(g_amec->pcap.ovs_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
+       {
+           // Take all non-GPU power away from the oversubscription power cap
+           L_n_mode_gpu_total_pcap = g_amec->pcap.ovs_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
+           // Add back in the power that will be dropped by processor DVFS and memory throttling and give to GPUs
+           L_n_mode_gpu_total_pcap += G_sysConfigData.total_proc_mem_pwr_drop_watts;
+       }
+       else
+       {
+           // This should not happen, the total non GPU power should never be higher than the N mode cap
+           // Log error and set GPUs to minimum power cap
+           L_n_mode_gpu_total_pcap = 0; // this will set minimum GPU power cap
+
+           TRAC_ERR("amec_gpu_pcap: non GPU max power %dW is more than N mode pwr limit %dW",
+                     G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.ovs_node_pcap);
+
+           /* @
+            * @errortype
+            * @moduleid    AMEC_GPU_PCAP_MID
+            * @reasoncode  GPU_FAILURE
+            * @userdata1   N mode Power Cap watts
+            * @userdata2   Total non-GPU power watts
+            * @userdata4   ERC_GPU_N_MODE_PCAP_CALC_FAILURE
+            * @devdesc     Total non-GPU power more than N mode power cap
+            *
+            */
+           errlHndl_t l_err = createErrl(AMEC_GPU_PCAP_MID,
+                                         GPU_FAILURE,
+                                         ERC_GPU_N_MODE_PCAP_CALC_FAILURE,
+                                         ERRL_SEV_PREDICTIVE,
+                                         NULL,
+                                         DEFAULT_TRACE_SIZE,
+                                         g_amec->pcap.ovs_node_pcap,
+                                         G_sysConfigData.total_non_gpu_max_pwr_watts);
+
+           //Callout firmware
+           addCalloutToErrl(l_err,
+                            ERRL_CALLOUT_TYPE_COMPONENT_ID,
+                            ERRL_COMPONENT_ID_FIRMWARE,
+                            ERRL_CALLOUT_PRIORITY_HIGH);
+           commitErrl(&l_err);
+       }
+    }  // if first run
+
+    // Calculate the total GPU power cap for the current active limit and PSR
+    // this only needs to be calculated if either the active limit or PSR changed
+    if( (L_first_run) || (i_active_pcap_changed) || (L_psr != G_sysConfigData.psr) )
+    {
+       L_psr = G_sysConfigData.psr;
+       if(g_amec->pcap.active_node_pcap > G_sysConfigData.total_non_gpu_max_pwr_watts)
+       {
+           // Take all non-GPU power away from the active power cap
+           L_active_psr_gpu_total_pcap = g_amec->pcap.active_node_pcap - G_sysConfigData.total_non_gpu_max_pwr_watts;
+           // Add back in the power that will be dropped by processor DVFS and memory throttling based on the PSR
+           // to give to GPUs (multiply before dividing so integer division does not truncate PSR < 100 to 0)
+           L_active_psr_gpu_total_pcap += ( ((uint32_t)L_psr * G_sysConfigData.total_proc_mem_pwr_drop_watts) / 100 );
+       }
+       else
+       {
+           // Set GPUs to minimum power cap
+           L_active_psr_gpu_total_pcap = 0;
+           TRAC_IMP("amec_gpu_pcap: non GPU max power %dW is more than active pwr limit %dW",
+                     G_sysConfigData.total_non_gpu_max_pwr_watts, g_amec->pcap.active_node_pcap);
+       }
+
+       // Total GPU power cap is the lower of oversubscription and active power limit
+       // must always account for oversubscription to ensure when a power supply is lost the OCC
+       // can react fast enough, GPU power capping is too slow and must have GPU power cap already
+       // set to account for oversubscription case
+       L_total_gpu_pcap = (L_n_mode_gpu_total_pcap < L_active_psr_gpu_total_pcap) ?
+                           L_n_mode_gpu_total_pcap : L_active_psr_gpu_total_pcap;
+
+       // Divide the total equally across all GPUs in the system
+       if(G_first_num_gpus_sys)
+       {
+          L_per_gpu_pcap = L_total_gpu_pcap / G_first_num_gpus_sys;
+       }
+       else
+       {
+           L_per_gpu_pcap = 0;
+           TRAC_ERR("amec_gpu_pcap: Called with no GPUs present!");
+       }
+    }
+
+    // Setup to send new power limit to GPUs. The actual sending of GPU power limit will be handled by task_gpu_sm()
+    for (i=0; i<MAX_NUM_GPU_PER_DOMAIN; i++)
+    {
+        // Before sending a GPU a power limit the power limits must be read from the GPU to know min/max GPU allows
+        if( GPU_PRESENT(i) && g_amec->gpu[i].pcap.pwr_limits_read )
+        {
+           l_gpu_cap_mw = L_per_gpu_pcap * 1000;  // convert W to mW
+
+           // GPU is present and have min/max power limits from GPU
+           // clip the GPU power limit to min/max GPU limit if needed
+           if(l_gpu_cap_mw < g_amec->gpu[i].pcap.gpu_min_pcap_mw)  // clip to min?
+           {
+              l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw;
+           }
+           else if(l_gpu_cap_mw > g_amec->gpu[i].pcap.gpu_max_pcap_mw)  // clip to max?
+           {
+              l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_max_pcap_mw;
+           }
+
+           // If not already at the min then set to min if trying to reduce power and proc/memory are at min
+           if( (i_avail_power < 0) && (g_amec->proc[0].pwr_votes.ppb_fmax == g_amec->sys.fmin) &&
+               (g_amec->pcap.active_mem_level) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) )
+           {
+              l_gpu_cap_mw = g_amec->gpu[i].pcap.gpu_min_pcap_mw;
+              if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw)
+              {
+                 TRAC_ERR("amec_gpu_pcap: Forcing GPU%d to minimum pwr limit %dmW", i, l_gpu_cap_mw);
+                 g_amec->gpu[i].pcap.gpu_min_cap_required = TRUE;
+              }
+           }
+
+           // check if this is a new power limit
+           if(g_amec->gpu[i].pcap.gpu_desired_pcap_mw != l_gpu_cap_mw)
+           {
+              TRAC_IMP("amec_gpu_pcap: Updating GPU%d desired pcap %dmW to %dmW", i,
+                        g_amec->gpu[i].pcap.gpu_desired_pcap_mw, l_gpu_cap_mw);
+              g_amec->gpu[i].pcap.gpu_desired_pcap_mw = l_gpu_cap_mw;
+
+              if( (g_amec->gpu[i].pcap.gpu_min_cap_required) && (l_gpu_cap_mw != g_amec->gpu[i].pcap.gpu_min_pcap_mw) )
+              {
+                 TRAC_ERR("amec_gpu_pcap: GPU%d no longer requires minimum pwr limit %dmW", i, g_amec->gpu[i].pcap.gpu_min_pcap_mw);
+                 g_amec->gpu[i].pcap.gpu_min_cap_required = FALSE;
+              }
+           }
+        }
+    }  // for each GPU
+
+    L_first_run = FALSE;
+}
+
 
 //////////////////////////
 // Function Specification
@@ -101,6 +266,7 @@ void amec_pcap_calc(void)
     /*  Local Variables                                                       */
     /*------------------------------------------------------------------------*/
     bool l_oversub_state  = 0;
+    bool l_active_pcap_changed = FALSE;
     uint16_t l_node_pwr = AMECSENSOR_PTR(PWRSYS)->sample;
     uint16_t l_p0_pwr   = AMECSENSOR_PTR(PWRPROC)->sample;
     int32_t l_avail_power = 0;
@@ -138,9 +304,17 @@ void amec_pcap_calc(void)
 
         // set this pcap as valid (needed by master for comparison)
         g_amec->pcap_valid = 1;
+        l_active_pcap_changed = TRUE;
     }
 
     l_avail_power = g_amec->pcap.active_node_pcap - l_node_pwr;
+
+    // Determine GPU power cap if there are GPUs present
+    if(G_first_proc_gpu_config)
+    {
+       amec_gpu_pcap(l_active_pcap_changed, l_avail_power);
+    }
+
     if(l_node_pwr != 0)
     {
         l_proc_fraction = ((uint32_t)(l_p0_pwr) << 16)/l_node_pwr;
diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h
index 47d6c09c..f206daf0 100755
--- a/src/occ_405/amec/amec_service_codes.h
+++ b/src/occ_405/amec/amec_service_codes.h
@@ -66,6 +66,7 @@ enum occAmecModuleId
     AMEC_CALC_DTS_SENSORS            =    AMEC_COMP_ID | 0x16,
     AMEC_SET_FREQ_RANGE              =    AMEC_COMP_ID | 0x17,
     AMEC_UPDATE_APSS_GPIO            =    AMEC_COMP_ID | 0x18,
+    AMEC_GPU_PCAP_MID                =    AMEC_COMP_ID | 0x19,
 };
 
 /*----------------------------------------------------------------------------*/
diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h
index a45fb42f..c084a0cc 100755
--- a/src/occ_405/amec/amec_sys.h
+++ b/src/occ_405/amec/amec_sys.h
@@ -449,6 +449,7 @@ typedef struct {
 typedef struct {
     bool     check_pwr_limit; // Indicates if need to read power limits from GPU
     bool     pwr_limits_read;   // Indicates if power limits were read i.e. have min/max
+    bool     gpu_min_cap_required;   // Indicates if GPU is being forced to its minimum power limit
     uint32_t gpu_min_pcap_mw; // Min GPU power limit in mW read from the GPU
     uint32_t gpu_max_pcap_mw; // Max GPU power limit in mW read from the GPU
     uint32_t gpu_desired_pcap_mw;  // AMEC determined pcap in mW to set
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c
index c6802ee2..6cdf79ef 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.c
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c
@@ -52,6 +52,7 @@
 #include "sensor_main_memory.h"
 extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
 extern bool G_vrm_thermal_monitoring;
+extern uint32_t G_first_proc_gpu_config;
 
 #include <gpe_export.h>
 extern gpe_shared_data_t G_shared_gpe_data;
@@ -223,7 +224,11 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
                  l_poll_rsp->errl_id, l_poll_rsp->errl_length, l_poll_rsp->errl_address);
     }
 
-    // Byte 15 - 16: reserved.
+    // Byte 15: reserved.
+
+    // Byte 16: GPU Configuration
+    l_poll_rsp->gpu_presence = (uint8_t)G_first_proc_gpu_config;
+
     // Byte 17 - 32 (16 bytes): OCC level
     memcpy( (void *) l_poll_rsp->occ_level, (void *) &G_occ_buildname[0], 16);
 
@@ -1165,21 +1170,17 @@ void cmdh_dbug_get_apss_data (const cmdh_fsp_cmd_t * i_cmd_ptr,
         //Get the data for each channel individually and write it to
         for (i = 0; i < MAX_APSS_ADC_CHANNELS; i++)
         {
-
-            if(AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid != 0)
-            {
-                l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain;
-                l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset;
-                l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i];
-                l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample;
-                l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i];
-                l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid;
-
-                TRAC_IMP("DBG__APSS Ch[%02d]:  Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],",
-                         i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain);
-                TRAC_IMP("                     Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]",
-                         l_resp_ptr->ApssCh[i].calculated, l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid);
-            }
+            l_resp_ptr->ApssCh[i].gain = G_sysConfigData.apss_cal[i].gain;
+            l_resp_ptr->ApssCh[i].offset = G_sysConfigData.apss_cal[i].offset;
+            l_resp_ptr->ApssCh[i].raw = G_dcom_slv_inbox_rx.adc[i];
+            l_resp_ptr->ApssCh[i].calculated = AMECSENSOR_PTR(PWRAPSSCH0 + i)->sample;
+            l_resp_ptr->ApssCh[i].func = G_apss_ch_to_function[i];
+            l_resp_ptr->ApssCh[i].ipmi_sid = AMECSENSOR_PTR(PWRAPSSCH0 + i)->ipmi_sid;
+
+            TRAC_IMP("DBG__APSS Ch[%02d]:  Raw[0x%04x], Offset[0x%08x], Gain[0x%08x],",
+                      i, l_resp_ptr->ApssCh[i].raw, l_resp_ptr->ApssCh[i].offset, l_resp_ptr->ApssCh[i].gain);
+            TRAC_IMP("                     Pwr[0x%04x], FuncID[0x%02x], IPMI_sensorID[0x%X]",
+                      l_resp_ptr->ApssCh[i].calculated, l_resp_ptr->ApssCh[i].func, l_resp_ptr->ApssCh[i].ipmi_sid);
         }
 
     }while(0);
diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.h b/src/occ_405/cmdh/cmdh_fsp_cmds.h
index 3bd0f739..2f3688ff 100755
--- a/src/occ_405/cmdh/cmdh_fsp_cmds.h
+++ b/src/occ_405/cmdh/cmdh_fsp_cmds.h
@@ -157,9 +157,10 @@ typedef struct __attribute__ ((packed)) cmdh_poll_resp_v20
     uint32_t  errl_address;
     // BYTES 13 - 14: Error Log Length
     uint16_t  errl_length;
-    // BYTES 15 - 16: Reserved
+    // BYTE 15: Reserved
     uint8_t   _reserved_15;
-    uint8_t   _reserved_16;
+    // BYTE 16: GPU Configuration
+    uint8_t   gpu_presence;
     // BYTES 17 - 32 (16 bytes): OCC Code Level - ASCII string of OCC build level currently running.
     uint8_t   occ_level[16];
     // BYTES 33 - 38 (6 bytes):  ASCII eye catcher "SENSOR"
diff --git a/src/occ_405/errl/errl.h b/src/occ_405/errl/errl.h
index 29f7d407..ad56c9ae 100755
--- a/src/occ_405/errl/errl.h
+++ b/src/occ_405/errl/errl.h
@@ -144,6 +144,7 @@ typedef enum
 {
     ERRL_CALLOUT_TYPE_HUID          = 0x01,
     ERRL_CALLOUT_TYPE_COMPONENT_ID  = 0x02,
+    ERRL_CALLOUT_TYPE_GPU_ID        = 0x03,
 } ERRL_CALLOUT_TYPE;
 
 /* TMGT-OCC Component Ids */
diff --git a/src/occ_405/gpu/gpu.c b/src/occ_405/gpu/gpu.c
index 522ea842..052039e3 100755
--- a/src/occ_405/gpu/gpu.c
+++ b/src/occ_405/gpu/gpu.c
@@ -389,7 +389,7 @@ void mark_gpu_failed(const gpu_sm_args_t *i_arg)
             if(G_sysConfigData.gpu_sensor_ids[gpu_id])
             {
                addCalloutToErrl(l_err,
-                                ERRL_CALLOUT_TYPE_HUID,
+                                ERRL_CALLOUT_TYPE_GPU_ID,
                                 G_sysConfigData.gpu_sensor_ids[gpu_id],
                                 ERRL_CALLOUT_PRIORITY_MED);
             }
@@ -819,7 +819,7 @@ bool gpu_read_temp_sm()
                     if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                     {
                         addCalloutToErrl(l_err,
-                                         ERRL_CALLOUT_TYPE_HUID,
+                                         ERRL_CALLOUT_TYPE_GPU_ID,
                                          G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                          ERRL_CALLOUT_PRIORITY_MED);
                     }
@@ -916,7 +916,7 @@ bool gpu_read_mem_temp_capability_sm()
              if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
              {
                 addCalloutToErrl(l_err,
-                                 ERRL_CALLOUT_TYPE_HUID,
+                                 ERRL_CALLOUT_TYPE_GPU_ID,
                                  G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                  ERRL_CALLOUT_PRIORITY_MED);
              }
@@ -1060,7 +1060,7 @@ bool gpu_read_memory_temp_sm()
                  if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                  {
                     addCalloutToErrl(l_err,
-                                     ERRL_CALLOUT_TYPE_HUID,
+                                     ERRL_CALLOUT_TYPE_GPU_ID,
                                      G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                      ERRL_CALLOUT_PRIORITY_MED);
                  }
@@ -1149,7 +1149,7 @@ bool gpu_read_memory_temp_sm()
                    if(G_sysConfigData.gpu_sensor_ids[G_current_gpu_id])
                    {
                       addCalloutToErrl(l_err,
-                                       ERRL_CALLOUT_TYPE_HUID,
+                                       ERRL_CALLOUT_TYPE_GPU_ID,
                                        G_sysConfigData.gpu_sensor_ids[G_current_gpu_id],
                                        ERRL_CALLOUT_PRIORITY_MED);
                    }
@@ -1498,6 +1498,11 @@ void task_gpu_sm(struct task *i_self)
                       // to let IDLE SM decide what to do next
                       g_amec->gpu[G_current_gpu_id].status.checkDriverLoaded = FALSE;
                       g_amec->gpu[G_current_gpu_id].status.driverLoaded = FALSE;
+                      if(g_amec->gpu[G_current_gpu_id].status.driverLoaded)
+                      {
+                          // Driver is loaded, read the power limits so we can start GPU power capping
+                          g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = TRUE;
+                      }
                       G_gpu_state = GPU_STATE_IDLE;
                       l_start_next_state = TRUE;
                    }
@@ -1505,7 +1510,7 @@ void task_gpu_sm(struct task *i_self)
 
                case GPU_STATE_READ_PWR_LIMIT:
                    // Read power limits for current GPU
-                   if(1) // TODO
+                   if(1) // TODO read and set min/max GPU limit and set pwr_limits_read to TRUE if capping supported
                    {
                       // Read power limits complete for this GPU, go to IDLE state
                       // to let IDLE SM decide what to do next
diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h
index 310e76a3..bf2dbc86 100644
--- a/src/occ_405/occ_service_codes.h
+++ b/src/occ_405/occ_service_codes.h
@@ -289,6 +289,7 @@ enum occExtReasonCode
     ERC_GPU_READ_MEM_TEMP_TIMEOUT               = 0x00F5,
     ERC_GPU_READ_MEM_TEMP_CAPABLE_FAILURE       = 0x00F6,
     ERC_GPU_INVALID_GPU_OPERATION               = 0x00F7,
+    ERC_GPU_N_MODE_PCAP_CALC_FAILURE            = 0x00F8,
     ERC_GPU_NO_GPE_SUPPORT                      = 0x00FF,
 
     ERC_STATE_FROM_ALL_TO_STB_FAILURE           = 0x0123,