Skip to content

Commit

Permalink
GPU Timing Measurement Debug Command
Browse files Browse the repository at this point in the history
Change-Id: I5d37db9ba1aa9dc90b09266da6762121195d2385
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48629
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
  • Loading branch information
wilbryan committed Oct 19, 2017
1 parent c07a720 commit bacb45a
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 3 deletions.
55 changes: 55 additions & 0 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@
#include <avsbus.h>
#include "wof.h"
#include "sensor_main_memory.h"
#include "gpu.h"

extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;
extern uint32_t G_first_proc_gpu_config;
Expand Down Expand Up @@ -1416,6 +1418,55 @@ void cmdh_dbug_clear_ame_sensor(const cmdh_fsp_cmd_t * i_cmd_ptr,
G_rsp_status = l_rc;
}

// Function Specification
//
// Name: cmdh_dump_gpu_timings
//
// Description: Debug sub-command handler that traces the contents of the GPU
//              timing table (G_gpu_tick_times). For every GPU index from 0 to
//              MAX_NUM_GPU_PER_DOMAIN - 1 it emits one trace line per GPU
//              operation type containing the max and average duration in ticks
//              plus the three duration bucket counters.
//              Output goes to the trace buffer only; this function takes no
//              command/response pointers and writes no response data.
//
// End Function Specification
void cmdh_dump_gpu_timings(void)
{
    // Timing table is defined in gpu.c
    extern gpuTimingTable_t G_gpu_tick_times;
    int i = 0;

    for( ; i < MAX_NUM_GPU_PER_DOMAIN; i++)
    {
        TRAC_INFO("=======================================GPU%d===================================================", i);
        TRAC_INFO("| Max Avg 1s count 100ms count <100ms count|");
        TRAC_INFO("| Core Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.coretemp[i].max,
                  G_gpu_tick_times.coretemp[i].avg,
                  G_gpu_tick_times.coretemp[i].count_1s,
                  G_gpu_tick_times.coretemp[i].count_100ms,
                  G_gpu_tick_times.coretemp[i].count_lt100ms);
        TRAC_INFO("| Mem Temperatures %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.memtemp[i].max,
                  G_gpu_tick_times.memtemp[i].avg,
                  G_gpu_tick_times.memtemp[i].count_1s,
                  G_gpu_tick_times.memtemp[i].count_100ms,
                  G_gpu_tick_times.memtemp[i].count_lt100ms);
        TRAC_INFO("| Check Driver Loaded %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.checkdriver[i].max,
                  G_gpu_tick_times.checkdriver[i].avg,
                  G_gpu_tick_times.checkdriver[i].count_1s,
                  G_gpu_tick_times.checkdriver[i].count_100ms,
                  G_gpu_tick_times.checkdriver[i].count_lt100ms);
        TRAC_INFO("| Mem Capabilities %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.capabilities[i].max,
                  G_gpu_tick_times.capabilities[i].avg,
                  G_gpu_tick_times.capabilities[i].count_1s,
                  G_gpu_tick_times.capabilities[i].count_100ms,
                  G_gpu_tick_times.capabilities[i].count_lt100ms);
        TRAC_INFO("| Read Power Policy %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.getpcap[i].max,
                  G_gpu_tick_times.getpcap[i].avg,
                  G_gpu_tick_times.getpcap[i].count_1s,
                  G_gpu_tick_times.getpcap[i].count_100ms,
                  G_gpu_tick_times.getpcap[i].count_lt100ms);
        TRAC_INFO("| Set Power Cap %-5d ticks %-5d ticks %-5d %-5d %-5d",
                  G_gpu_tick_times.setpcap[i].max,
                  G_gpu_tick_times.setpcap[i].avg,
                  G_gpu_tick_times.setpcap[i].count_1s,
                  G_gpu_tick_times.setpcap[i].count_100ms,
                  G_gpu_tick_times.setpcap[i].count_lt100ms);
        // Closing separator has no conversion specifier, so it takes no argument
        TRAC_INFO("==============================================================================================");
    }
}

// Function Specification
//
// Name: dbug_parse_cmd
Expand Down Expand Up @@ -1458,6 +1509,10 @@ void cmdh_dbug_cmd (const cmdh_fsp_cmd_t * i_cmd_ptr,
// Act on Debug Sub-Command
switch ( l_sub_cmd )
{
case DBUG_DUMP_GPU_TIMINGS:
cmdh_dump_gpu_timings();
break;

case DBUG_GET_AME_SENSOR:
cmdh_dbug_get_ame_sensor(i_cmd_ptr, o_rsp_ptr);
break;
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/cmdh/cmdh_fsp_cmds.h
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ typedef enum
// free = 0x05
DBUG_SET_PEXE_EVENT = 0x06,
DBUG_GET_AME_SENSOR = 0x07,
// free = 0x08,
DBUG_DUMP_GPU_TIMINGS = 0x08,
DBUG_PEEK = 0x09,
DBUG_POKE = 0x0A,
DBUG_DUMP_THEMAL = 0x0B,
Expand Down
73 changes: 71 additions & 2 deletions src/occ_405/gpu/gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
#define GPU_TEMP_READ_1S ( 1000000 / (MICS_PER_TICK * 2) )
#define GPU_TIMEOUT ( 5000000 / (MICS_PER_TICK *2) )

// Duration thresholds (100 ms and 1 s) used by update_gpu_tick_sensor() to
// decide which bucket counter a completed GPU operation falls into. They use
// the same (MICS_PER_TICK * 2) divisor as GPU_TEMP_READ_1S and GPU_TIMEOUT.
// NOTE(review): the "* 2" presumably reflects the GPU state machine being
// called every other tick -- confirm against the task schedule.
#define GPU_TICKS_TO_100MS ( 100000 / (MICS_PER_TICK * 2) )
#define GPU_TICKS_TO_1S ( 1000000 / (MICS_PER_TICK * 2) )

// Number of consecutive failures to ignore after GPU is taken out of reset to give GPU init time
#define GPU_INIT_ERROR_COUNT 300 // approximately 300 seconds

Expand Down Expand Up @@ -93,6 +96,33 @@ gpu_sm_args_t G_new_gpu_req_args = {{{{0}}}};

uint8_t G_current_gpu_id = 0; // ID 0..2 of GPU currently being processed

// Timing statistics for each GPU operation type, one entry per GPU. Entries are
// updated through update_gpu_tick_sensor() when a GPU state machine reaches its
// COMPLETE state, and are traced by the cmdh_dump_gpu_timings() debug command.
gpuTimingTable_t G_gpu_tick_times;

// Function Specification
//
// Name: update_gpu_tick_sensor
//
// Description: Record the duration (in ticks) of one completed GPU operation
//              in the given timing sensor. Exactly one of the three duration
//              bucket counters is incremented, the maximum is raised if this
//              sample exceeds it, and the running average is recomputed from
//              the accumulated total and the sample count.
//
// End Function Specification
void update_gpu_tick_sensor(gpuTimingSensor_t *sensor, uint32_t ticks)
{
    // Start with the fastest bucket and promote if the sample is slower
    uint32_t *l_bucket = &sensor->count_lt100ms;

    if(ticks > GPU_TICKS_TO_1S)
    {
        l_bucket = &sensor->count_1s;
    }
    else if(ticks > GPU_TICKS_TO_100MS)
    {
        l_bucket = &sensor->count_100ms;
    }
    (*l_bucket)++;

    // Keep track of the slowest sample seen so far
    if(sensor->max < ticks)
    {
        sensor->max = ticks;
    }

    // Fold the sample into the running totals; count is incremented before
    // the division so the divisor is never zero here
    sensor->accum += ticks;
    sensor->count += 1;
    sensor->avg = sensor->accum / sensor->count;
}

// Find first present non-failed GPU. returns 0xFF if no GPUs present/functional
uint8_t get_first_gpu(void)
{
Expand Down Expand Up @@ -1085,6 +1115,10 @@ bool gpu_check_driver_loaded_sm()
static gpuCheckDriverLoadedState_e L_check_driver_state = GPU_STATE_CHECK_DRIVER_LOADED_NEW;
static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
Expand Down Expand Up @@ -1186,6 +1220,7 @@ bool gpu_check_driver_loaded_sm()
switch (L_check_driver_state)
{
case GPU_STATE_CHECK_DRIVER_LOADED_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_CHECK_DRIVER_START, G_new_gpu_req_args);
break;

Expand All @@ -1202,6 +1237,9 @@ bool gpu_check_driver_loaded_sm()
break;

case GPU_STATE_CHECK_DRIVER_LOADED_COMPLETE:
// Update GPU tick timing table
update_gpu_tick_sensor(&G_gpu_tick_times.checkdriver[G_current_gpu_id], L_num_ticks);

// Update driver loaded
l_new_driver_loaded = G_gpu_op_req_args.data[0] & 0x01;
if(l_new_driver_loaded != g_amec->gpu[G_current_gpu_id].status.driverLoaded)
Expand Down Expand Up @@ -1292,6 +1330,10 @@ bool gpu_read_pwr_limit_sm()
static uint32_t L_last_min[MAX_NUM_GPU_PER_DOMAIN] = {0};
static uint32_t L_last_max[MAX_NUM_GPU_PER_DOMAIN] = {0};

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
Expand Down Expand Up @@ -1409,6 +1451,7 @@ bool gpu_read_pwr_limit_sm()
{
// Step 1
case GPU_STATE_READ_PWR_LIMIT_1_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;

Expand Down Expand Up @@ -1440,7 +1483,7 @@ bool gpu_read_pwr_limit_sm()

// Step 3
case GPU_STATE_READ_PWR_LIMIT_3_START:
GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap", L_attempts);
GPU_DBG("gpu_read_pwr_limit_sm: took %d ticks to finish read pcap for GPU%d", L_attempts, G_current_gpu_id);
L_scheduled = schedule_gpu_req(GPU_REQ_GET_PWR_LIMIT_3_START, G_new_gpu_req_args);
break;

Expand Down Expand Up @@ -1494,6 +1537,8 @@ bool gpu_read_pwr_limit_sm()
break;

case GPU_STATE_READ_PWR_LIMIT_COMPLETE:
update_gpu_tick_sensor(&G_gpu_tick_times.getpcap[G_current_gpu_id], L_num_ticks);

g_amec->gpu[G_current_gpu_id].pcap.check_pwr_limit = FALSE;
// Update power limits
g_amec->gpu[G_current_gpu_id].pcap.pwr_limits_read = TRUE;
Expand Down Expand Up @@ -1568,6 +1613,10 @@ bool gpu_set_pwr_limit_sm()

static uint32_t L_last_pcap[MAX_NUM_GPU_PER_DOMAIN] = {0};

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new set limit then need to check status of current state before moving on
Expand Down Expand Up @@ -1686,6 +1735,7 @@ bool gpu_set_pwr_limit_sm()
{
// Step 1
case GPU_STATE_SET_PWR_LIMIT_1_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_SET_PWR_LIMIT_1_START, G_new_gpu_req_args);
break;

Expand Down Expand Up @@ -1754,7 +1804,8 @@ bool gpu_set_pwr_limit_sm()
break;

case GPU_STATE_SET_PWR_LIMIT_COMPLETE:
GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap", L_attempts);
update_gpu_tick_sensor(&G_gpu_tick_times.setpcap[G_current_gpu_id], L_num_ticks);
GPU_DBG("gpu_set_pwr_limit_sm: took %d ticks to finish setting pcap for GPU%d", L_attempts, G_current_gpu_id);
// Update the requested power limit since it was successfully sent
// NOTE: want this value to be sent back from the GPE to know what was set in case AMEC
// has calculated a new desired pcap while this one was already in process of being set
Expand Down Expand Up @@ -1818,6 +1869,10 @@ bool gpu_read_temp_sm()
static bool L_trace_success = FALSE;
static gpuReadTempState_e L_read_temp_state = GPU_STATE_READ_TEMP_NEW; // 1st state for reading temp

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
Expand Down Expand Up @@ -1852,6 +1907,7 @@ bool gpu_read_temp_sm()
switch (L_read_temp_state)
{
case GPU_STATE_READ_TEMP_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_TEMP_START, G_new_gpu_req_args);
break;

Expand All @@ -1860,6 +1916,7 @@ bool gpu_read_temp_sm()
break;

case GPU_STATE_READ_TEMP_COMPLETE:
update_gpu_tick_sensor(&G_gpu_tick_times.coretemp[G_current_gpu_id], L_num_ticks);
if( (!g_amec->gpu[G_current_gpu_id].status.readOnce) &&
(0 != G_gpu_op_req_args.data[0]) )
{
Expand Down Expand Up @@ -1987,6 +2044,10 @@ bool gpu_read_mem_temp_capability_sm()
static gpuReadMemTempCapableState_e L_read_cap_state = GPU_STATE_READ_MEM_TEMP_CAPABLE_NEW;
static bool L_error_logged[MAX_NUM_GPU_PER_DOMAIN] = {FALSE};

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
Expand Down Expand Up @@ -2087,6 +2148,7 @@ bool gpu_read_mem_temp_capability_sm()
switch (L_read_cap_state)
{
case GPU_STATE_READ_MEM_TEMP_CAPABLE_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_CAPS_START, G_new_gpu_req_args);
break;

Expand All @@ -2103,6 +2165,7 @@ bool gpu_read_mem_temp_capability_sm()
break;

case GPU_STATE_READ_MEM_TEMP_CAPABLE_COMPLETE:
update_gpu_tick_sensor(&G_gpu_tick_times.capabilities[G_current_gpu_id], L_num_ticks);
// Update capability
g_amec->gpu[G_current_gpu_id].status.memTempSupported = G_gpu_op_req_args.data[0] & 0x01;

Expand Down Expand Up @@ -2168,6 +2231,10 @@ bool gpu_read_memory_temp_sm()
static uint8_t L_read_failure_count = 0;
static gpuReadMemTempState_e L_read_temp_state = GPU_STATE_READ_MEM_TEMP_NEW; // 1st state for reading temp

static uint32_t L_num_ticks = 0;

L_num_ticks++;

if (async_request_is_idle(&G_gpu_op_request.request))
{
// If not starting a new read then need to check status of current state before moving on
Expand Down Expand Up @@ -2281,6 +2348,7 @@ bool gpu_read_memory_temp_sm()
switch (L_read_temp_state)
{
case GPU_STATE_READ_MEM_TEMP_START:
L_num_ticks = 1;
L_scheduled = schedule_gpu_req(GPU_REQ_READ_MEM_TEMP_START, G_new_gpu_req_args);
break;

Expand All @@ -2297,6 +2365,7 @@ bool gpu_read_memory_temp_sm()
break;

case GPU_STATE_READ_MEM_TEMP_COMPLETE:
update_gpu_tick_sensor(&G_gpu_tick_times.memtemp[G_current_gpu_id], L_num_ticks);
// Update sensor
l_temp = G_gpu_op_req_args.data[0];
sensor_update(AMECSENSOR_PTR(TEMPGPU0MEM + G_current_gpu_id), l_temp);
Expand Down
21 changes: 21 additions & 0 deletions src/occ_405/gpu/gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,5 +156,26 @@ void gpu_ipc_init();
// GPU state machine
void task_gpu_sm(struct task *i_self);

// Timing statistics for one GPU operation type on one GPU.
// Durations are measured in ticks (calls of the owning GPU state machine) and
// are recorded by update_gpu_tick_sensor().
typedef struct gpuTimingSensor
{
// Longest duration recorded so far, in ticks
uint32_t max;
// Running average duration in ticks (accum / count, integer division)
uint32_t avg;
// Number of samples that took more than GPU_TICKS_TO_1S ticks
uint32_t count_1s;
// Number of samples that took more than GPU_TICKS_TO_100MS ticks but not
// more than GPU_TICKS_TO_1S ticks
uint32_t count_100ms;
// Number of samples that took GPU_TICKS_TO_100MS ticks or fewer
uint32_t count_lt100ms;
// Sum of all recorded durations, in ticks
uint64_t accum;
// Total number of samples recorded
uint64_t count;
} gpuTimingSensor_t;

// Table for GPU timings: one gpuTimingSensor_t per GPU for each GPU operation
// type. Each array is indexed by GPU id (G_current_gpu_id at the time the
// operation completes).
typedef struct gpuTimingTable
{
// Read power limit (updated by gpu_read_pwr_limit_sm)
gpuTimingSensor_t getpcap[MAX_NUM_GPU_PER_DOMAIN];
// Set power limit (updated by gpu_set_pwr_limit_sm)
gpuTimingSensor_t setpcap[MAX_NUM_GPU_PER_DOMAIN];
// Read core temperature (updated by gpu_read_temp_sm)
gpuTimingSensor_t coretemp[MAX_NUM_GPU_PER_DOMAIN];
// Read memory temperature (updated by gpu_read_memory_temp_sm)
gpuTimingSensor_t memtemp[MAX_NUM_GPU_PER_DOMAIN];
// Read memory temperature capability (updated by gpu_read_mem_temp_capability_sm)
gpuTimingSensor_t capabilities[MAX_NUM_GPU_PER_DOMAIN];
// Check driver loaded (updated by gpu_check_driver_loaded_sm)
gpuTimingSensor_t checkdriver[MAX_NUM_GPU_PER_DOMAIN];
} gpuTimingTable_t;

#endif //_GPU_H

0 comments on commit bacb45a

Please sign in to comment.