Skip to content

Commit

Permalink
Retry GPU power cap (PCAP) operations after a busy failure
Browse files Browse the repository at this point in the history
CQ:SW414846

Change-Id: I7a4c42de414529da963c4f23f27b99a855a4b727
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/55109
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
  • Loading branch information
wilbryan committed Mar 7, 2018
1 parent 1c7b23c commit 81196c3
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 33 deletions.
3 changes: 2 additions & 1 deletion src/common/gpe_err.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -56,5 +56,6 @@
#define GPE_RC_GPU_CMD_NOT_SUPPORTED 0x82 // GPU rejected command with no support
#define GPE_RC_GPU_CMD_FAILED 0x83 // An error occurred in the last GPU operation
#define GPE_RC_GPU_INIT_FAILED 0x84 // Failed to init GPU
#define GPE_RC_GPU_BUSY 0x85 // GPU busy; the power limit state machines flag the operation for retry

#endif //_GPE_ERR_H
11 changes: 5 additions & 6 deletions src/common/gpu_structs.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016,2017 */
/* Contributors Listed Below - COPYRIGHT 2016,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -83,12 +83,12 @@ typedef enum

// Set GPU power cap
GPU_REQ_SET_PWR_LIMIT_1_START = 0x20,
GPU_REQ_SET_PWR_LIMIT_1_1 = 0x21,
GPU_REQ_SET_PWR_LIMIT_1_2 = 0x22,
GPU_REQ_SET_PWR_LIMIT_1_2 = 0x21,
GPU_REQ_SET_PWR_LIMIT_1_3 = 0x22,
GPU_REQ_SET_PWR_LIMIT_1_FINISH = 0x23,
GPU_REQ_SET_PWR_LIMIT_2_START = 0x24,
GPU_REQ_SET_PWR_LIMIT_2_1 = 0x25,
GPU_REQ_SET_PWR_LIMIT_2_2 = 0x26,
GPU_REQ_SET_PWR_LIMIT_2_2 = 0x25,
GPU_REQ_SET_PWR_LIMIT_2_3 = 0x26,
GPU_REQ_SET_PWR_LIMIT_2_FINISH = 0x27,
GPU_REQ_SET_PWR_LIMIT_3_START = 0x28,
GPU_REQ_SET_PWR_LIMIT_3_2 = 0x29,
Expand All @@ -98,7 +98,6 @@ typedef enum
GPU_REQ_SET_PWR_LIMIT_4_2 = 0x2D,
GPU_REQ_SET_PWR_LIMIT_4_FINISH = 0x2E,


// Start check driver loaded
GPU_REQ_CHECK_DRIVER_START = 0x31,
GPU_REQ_CHECK_DRIVER_2 = 0x32,
Expand Down
12 changes: 6 additions & 6 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Original file line number Diff line number Diff line change
Expand Up @@ -1551,37 +1551,37 @@ void cmdh_dump_gpu_timings(void)
{
TRAC_INFO("=======================================GPU%d===================================================", i);
TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|");
TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Core Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.coretemp[i].max,
G_gpu_tick_times.coretemp[i].avg,
G_gpu_tick_times.coretemp[i].count_1s,
G_gpu_tick_times.coretemp[i].count_100ms,
G_gpu_tick_times.coretemp[i].count_lt100ms);
TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Mem Temperatures %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.memtemp[i].max,
G_gpu_tick_times.memtemp[i].avg,
G_gpu_tick_times.memtemp[i].count_1s,
G_gpu_tick_times.memtemp[i].count_100ms,
G_gpu_tick_times.memtemp[i].count_lt100ms);
TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Check Driver Loaded %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.checkdriver[i].max,
G_gpu_tick_times.checkdriver[i].avg,
G_gpu_tick_times.checkdriver[i].count_1s,
G_gpu_tick_times.checkdriver[i].count_100ms,
G_gpu_tick_times.checkdriver[i].count_lt100ms);
TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Mem Capabilities %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.capabilities[i].max,
G_gpu_tick_times.capabilities[i].avg,
G_gpu_tick_times.capabilities[i].count_1s,
G_gpu_tick_times.capabilities[i].count_100ms,
G_gpu_tick_times.capabilities[i].count_lt100ms);
TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Read Power Policy %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.getpcap[i].max,
G_gpu_tick_times.getpcap[i].avg,
G_gpu_tick_times.getpcap[i].count_1s,
G_gpu_tick_times.getpcap[i].count_100ms,
G_gpu_tick_times.getpcap[i].count_lt100ms);
TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d",
TRAC_INFO("| Set Power Cap %-5d msecs %-5d msecs %-5d %-5d %-5d",
G_gpu_tick_times.setpcap[i].max,
G_gpu_tick_times.setpcap[i].avg,
G_gpu_tick_times.setpcap[i].count_1s,
Expand Down
72 changes: 57 additions & 15 deletions src/occ_405/gpu/gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -828,12 +828,12 @@ bool schedule_gpu_req(const gpu_op_req_e i_operation, gpu_sm_args_t i_new_args)

// Set GPU Power Limit
case GPU_REQ_SET_PWR_LIMIT_1_START:
case GPU_REQ_SET_PWR_LIMIT_1_1:
case GPU_REQ_SET_PWR_LIMIT_1_2:
case GPU_REQ_SET_PWR_LIMIT_1_3:
case GPU_REQ_SET_PWR_LIMIT_1_FINISH:
case GPU_REQ_SET_PWR_LIMIT_2_START:
case GPU_REQ_SET_PWR_LIMIT_2_1:
case GPU_REQ_SET_PWR_LIMIT_2_2:
case GPU_REQ_SET_PWR_LIMIT_2_3:
case GPU_REQ_SET_PWR_LIMIT_2_FINISH:
case GPU_REQ_SET_PWR_LIMIT_3_START:
case GPU_REQ_SET_PWR_LIMIT_3_2:
Expand Down Expand Up @@ -1336,6 +1336,8 @@ bool gpu_read_pwr_limit_sm()

static uint32_t L_num_ticks = 0;

static bool L_retry_necessary = FALSE;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
Expand All @@ -1354,6 +1356,12 @@ bool gpu_read_pwr_limit_sm()
L_state_failure_count = 0;
L_attempts++;
}
else if( (L_read_pwr_limit_state == GPU_STATE_READ_PWR_LIMIT_1_3) &&
(GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) )
{
L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_1_FINISH;
L_retry_necessary = TRUE;
}
// Check if failure was due to driver change
else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
Expand All @@ -1364,6 +1372,7 @@ bool gpu_read_pwr_limit_sm()
L_read_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
L_attempts = 0;
L_retry_necessary = FALSE;
return TRUE; // Done with this GPU, let GPU SM move to next
}

Expand Down Expand Up @@ -1446,7 +1455,17 @@ bool gpu_read_pwr_limit_sm()
else // success on last state go to next state and process it
{
L_state_failure_count = 0;
L_read_pwr_limit_state++;
if( (GPU_STATE_READ_PWR_LIMIT_1_FINISH == L_read_pwr_limit_state) &&
(L_retry_necessary) )
{
// Let SM move on
L_read_pwr_limit_state = GPU_STATE_READ_PWR_LIMIT_NEW;
return TRUE;
}
else
{
L_read_pwr_limit_state++;
}
}

L_scheduled = FALSE; // default nothing scheduled
Expand All @@ -1455,7 +1474,8 @@ bool gpu_read_pwr_limit_sm()
{
// Step 1
case GPU_STATE_READ_PWR_LIMIT_1_START:
L_num_ticks = 1;
if(!L_retry_necessary) L_num_ticks = 1;
L_retry_necessary = FALSE;
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;

Expand Down Expand Up @@ -1619,6 +1639,8 @@ bool gpu_set_pwr_limit_sm()

static uint32_t L_num_ticks = 0;

static bool L_retry_necessary = FALSE;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
Expand All @@ -1637,6 +1659,12 @@ bool gpu_set_pwr_limit_sm()
L_state_failure_count = 0;
L_attempts++;
}
else if( (L_set_pwr_limit_state == GPU_STATE_SET_PWR_LIMIT_3_3) &&
(GPE_RC_GPU_BUSY == G_gpu_op_req_args.error.rc) )
{
L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_3_FINISH;
L_retry_necessary = TRUE;
}
// Check if failure was due to driver change
else if(G_gpu_op_req_args.error.rc == GPE_RC_GPU_DRIVER_CHANGE)
{
Expand All @@ -1647,6 +1675,7 @@ bool gpu_set_pwr_limit_sm()
L_set_pwr_limit_failure_count[G_current_gpu_id] = 0; // clear failure count since there's a driver change
L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
L_attempts = 0;
L_retry_necessary = FALSE;
return TRUE; // Done with this GPU, let GPU SM move to next
}
// If reached retry count give up on this read
Expand All @@ -1660,6 +1689,8 @@ bool gpu_set_pwr_limit_sm()
// It will be retried if detected that GPU is put in reset and then taken out or driver change
g_amec->gpu[G_current_gpu_id].pcap.set_failed = true;
L_set_pwr_limit_failure_count[G_current_gpu_id] = 0;
INTR_TRAC_ERR("gpu_set_pwr_limit: Timedout setting power limit %d for GPU%d [attempts:%d][state_fail:%d]",
G_gpu_op_req_args.data[0], G_current_gpu_id, L_attempts, L_state_failure_count);
}
// if GPU is not in reset then INC error count and check if reached threshold
if(g_amec->gpu[G_current_gpu_id].status.notReset)
Expand Down Expand Up @@ -1730,7 +1761,17 @@ bool gpu_set_pwr_limit_sm()
else // success on last state go to next state and process it
{
L_state_failure_count = 0;
L_set_pwr_limit_state++;
if( (GPU_STATE_SET_PWR_LIMIT_4_FINISH == L_set_pwr_limit_state ) &&
(L_retry_necessary) )
{
// Let SM move to next
L_set_pwr_limit_state = GPU_STATE_SET_PWR_LIMIT_NEW;
return TRUE;
}
else
{
L_set_pwr_limit_state++;
}
}

L_scheduled = FALSE; // default nothing scheduled
Expand All @@ -1739,18 +1780,19 @@ bool gpu_set_pwr_limit_sm()
{
// Step 1
case GPU_STATE_SET_PWR_LIMIT_1_START:
L_num_ticks = 1;
if(!L_retry_necessary) L_num_ticks = 1;
L_retry_necessary = FALSE;
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_1_1:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_1, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_1_2:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_2, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_1_3:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_3, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_1_FINISH:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_FINISH, G_new_gpu_req_args);
break;
Expand All @@ -1764,14 +1806,14 @@ bool gpu_set_pwr_limit_sm()
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_START, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_2_1:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_1, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_2_2:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_2, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_2_3:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_3, G_new_gpu_req_args);
break;

case GPU_STATE_SET_PWR_LIMIT_2_FINISH:
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_2_FINISH, G_new_gpu_req_args);
break;
Expand Down
10 changes: 5 additions & 5 deletions src/occ_405/gpu/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -133,12 +133,12 @@ typedef enum
{
GPU_STATE_SET_PWR_LIMIT_NEW = 0x71,
GPU_STATE_SET_PWR_LIMIT_1_START = 0x72,
GPU_STATE_SET_PWR_LIMIT_1_1 = 0x73,
GPU_STATE_SET_PWR_LIMIT_1_2 = 0x74,
GPU_STATE_SET_PWR_LIMIT_1_2 = 0x73,
GPU_STATE_SET_PWR_LIMIT_1_3 = 0x74,
GPU_STATE_SET_PWR_LIMIT_1_FINISH = 0x75,
GPU_STATE_SET_PWR_LIMIT_2_START = 0x76,
GPU_STATE_SET_PWR_LIMIT_2_1 = 0x77,
GPU_STATE_SET_PWR_LIMIT_2_2 = 0x78,
GPU_STATE_SET_PWR_LIMIT_2_2 = 0x77,
GPU_STATE_SET_PWR_LIMIT_2_3 = 0x78,
GPU_STATE_SET_PWR_LIMIT_2_FINISH = 0x79,
GPU_STATE_SET_PWR_LIMIT_3_START = 0x7A,
GPU_STATE_SET_PWR_LIMIT_3_2 = 0x7B,
Expand Down

0 comments on commit 81196c3

Please sign in to comment.