Skip to content

Commit

Permalink
rasdaemon: Add support for post-processing MCA errors
Browse files Browse the repository at this point in the history
Currently, the rasdaemon performs detailed error decoding of received
MCA errors on the system only while it is running, either as a daemon
or in the foreground.

As such, error decoding cannot be undertaken for any MCA errors received
while the rasdaemon wasn't running. Additionally, if error decoding
modules like edac_mce_amd have not been loaded either, error records in the
dmesg buffer might correspond to raw values in associated MSRs, compelling
users to undertake decoding manually. The scenario seems more plausible on
AMD systems with Scalable MCA (SMCA) with plans in place to remove SMCA
Extended Error Descriptions from the edac_mce_amd module in an effort to
offload SMCA Error Decoding to the rasdaemon.

As such, add support to post-process and decode MCA Errors received on AMD
SMCA systems from raw MSR values. Support for post-processing and decoding
of MCA Errors received on CPUs of other vendors can be added in the future,
as needed.

Suggested-by: Yazen Ghannam <yazen.ghannam@amd.com>
Signed-off-by: Avadhut Naik <avadhut.naik@amd.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
  • Loading branch information
Avadhut Naik authored and mchehab committed Oct 23, 2023
1 parent aa36c96 commit 932118b
Show file tree
Hide file tree
Showing 7 changed files with 226 additions and 11 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,16 @@ required):
# rasdaemon -f -r
```

To post-process and decode received MCA errors on AMD SMCA systems, run:

```
# rasdaemon -p --status <STATUS_reg> --ipid <IPID_reg> --smca --family <CPU Family> --model <CPU Model> --bank <BANK_NUM>
```

Status and IPID Register values (in hex) are mandatory. The `--smca` flag,
together with `--family` and `--model`, is required when not decoding
locally (i.e. when the CPU of the local system should not be auto-detected).
The `--bank` parameter is optional.

You may also start it via systemd:

```
Expand Down
8 changes: 5 additions & 3 deletions mce-amd-smca.c
Original file line number Diff line number Diff line change
Expand Up @@ -710,7 +710,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_GMI_PHY] = { "Global Memory Interconnect PHY Unit" },
};

static void amd_decode_errcode(struct mce_event *e)
void amd_decode_errcode(struct mce_event *e)
{

decode_amd_errcode(e);
Expand Down Expand Up @@ -782,7 +782,7 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
}

/* Decode extended errors according to Scalable MCA specification */
static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
void decode_smca_error(struct mce_event *e, struct mce_priv *m)
{
enum smca_bank_types bank_type;
const char *ip_name;
Expand Down Expand Up @@ -827,7 +827,9 @@ static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
/* Only print the descriptor of valid extended error code */
if (xec < smca_mce_descs[bank_type].num_descs)
mce_snprintf(e->mcastatus_msg,
" %s.\n", smca_mce_descs[bank_type].descs[xec]);
"%s. Ext Err Code: %d",
smca_mce_descs[bank_type].descs[xec],
xec);

if (bank_type == SMCA_UMC && xec == 0) {
channel = find_umc_channel(e);
Expand Down
1 change: 1 addition & 0 deletions ras-events.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ enum ghes_severity {

/* Function prototypes */
int toggle_ras_mc_event(int enable);
int ras_offline_mce_event(struct ras_mc_offline_event *event);
int handle_ras_events(int record_events);

#endif
110 changes: 103 additions & 7 deletions ras-mce-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,8 @@ static char *cputype_name[] = {
[CPU_SAPPHIRERAPIDS] = "Sapphirerapids server",
};

static enum cputype select_intel_cputype(struct ras_events *ras)
static enum cputype select_intel_cputype(struct mce_priv *mce)
{
struct mce_priv *mce = ras->mce_priv;

if (mce->family == 15) {
if (mce->model == 6)
return CPU_TULSA;
Expand Down Expand Up @@ -140,9 +138,8 @@ static enum cputype select_intel_cputype(struct ras_events *ras)
return mce->family == 6 ? CPU_P6OLD : CPU_GENERIC;
}

static int detect_cpu(struct ras_events *ras)
static int detect_cpu(struct mce_priv *mce)
{
struct mce_priv *mce = ras->mce_priv;
FILE *f;
int ret = 0;
char *line = NULL;
Expand Down Expand Up @@ -221,7 +218,7 @@ static int detect_cpu(struct ras_events *ras)
}
goto ret;
} else if (!strcmp(mce->vendor,"GenuineIntel")) {
mce->cputype = select_intel_cputype(ras);
mce->cputype = select_intel_cputype(mce);
} else {
ret = EINVAL;
}
Expand All @@ -246,7 +243,7 @@ int register_mce_handler(struct ras_events *ras, unsigned ncpus)

mce = ras->mce_priv;

rc = detect_cpu(ras);
rc = detect_cpu(mce);
if (rc) {
if (mce->processor_flags)
free (mce->processor_flags);
Expand Down Expand Up @@ -383,6 +380,105 @@ static void report_mce_event(struct ras_events *ras,
*/
}

static int report_mce_offline(struct trace_seq *s,
struct mce_event *mce,
struct mce_priv *priv)
{
time_t now;
struct tm *tm;

time(&now);
tm = localtime(&now);

if (tm)
strftime(mce->timestamp, sizeof(mce->timestamp),
"%Y-%m-%d %H:%M:%S %z", tm);
trace_seq_printf(s, "%s,", mce->timestamp);

if (*mce->bank_name)
trace_seq_printf(s, " %s,", mce->bank_name);
else
trace_seq_printf(s, " bank=%x,", mce->bank);

if (*mce->mcastatus_msg)
trace_seq_printf(s, " mca: %s,", mce->mcastatus_msg);

if (*mce->mcistatus_msg)
trace_seq_printf(s, " mci: %s,", mce->mcistatus_msg);

if (*mce->mc_location)
trace_seq_printf(s, " Locn: %s,", mce->mc_location);

if (*mce->error_msg)
trace_seq_printf(s, " Error Msg: %s\n", mce->error_msg);

return 0;
}

/*
 * Decode a single MCA error from raw register values and print the result
 * to stdout.
 *
 * @event: raw values to decode. When @event->smca is set, the CPU is taken
 *         to be an AMD SMCA system with the given family/model; otherwise
 *         the CPU of the local system is detected via detect_cpu().
 *
 * Only CPU_AMD_SMCA is decoded; for any other CPU type the record is printed
 * without extended decoding.
 *
 * Returns 0 on success, ENOMEM if an allocation fails, the detect_cpu()
 * error code if CPU detection fails, or -EINVAL if the status or IPID
 * register value is zero on an SMCA system.
 */
int ras_offline_mce_event(struct ras_mc_offline_event *event)
{
	int rc = 0;
	struct trace_seq s;
	struct mce_event *mce;
	struct mce_priv *priv;

	mce = calloc(1, sizeof(*mce));
	if (!mce) {
		log(TERM, LOG_ERR, "Can't allocate memory for mce struct\n");
		/* Don't read errno here: log() may have overwritten it. */
		return ENOMEM;
	}

	priv = calloc(1, sizeof(*priv));
	if (!priv) {
		log(TERM, LOG_ERR, "Can't allocate memory for mce_priv struct\n");
		free(mce);
		return ENOMEM;
	}

	if (event->smca) {
		priv->cputype = CPU_AMD_SMCA;
		priv->family = event->family;
		priv->model = event->model;
	} else {
		rc = detect_cpu(priv);
		if (rc) {
			log(TERM, LOG_ERR, "Failed to detect CPU\n");
			goto free_mce;
		}
	}

	mce->status = event->status;
	mce->bank = event->bank;

	switch (priv->cputype) {
	case CPU_AMD_SMCA:
		mce->synd = event->synd;
		mce->ipid = event->ipid;
		/* Both registers are mandatory; zero means "not supplied". */
		if (!mce->ipid || !mce->status) {
			log(TERM, LOG_ERR, "%s MSR required.\n",
			    mce->ipid ? "Status" : "Ipid");
			rc = -EINVAL;
			goto free_mce;
		}
		decode_smca_error(mce, priv);
		amd_decode_errcode(mce);
		break;
	default:
		break;
	}

	trace_seq_init(&s);
	report_mce_offline(&s, mce, priv);
	trace_seq_do_printf(&s);
	fflush(stdout);
	trace_seq_destroy(&s);

free_mce:
	/*
	 * detect_cpu() may have allocated processor_flags (the online
	 * registration path frees it after a failed detection); it stays
	 * NULL from calloc() otherwise, and free(NULL) is a no-op.
	 */
	free(priv->processor_flags);
	free(priv);
	free(mce);
	return rc;
}

int ras_mce_event_handler(struct trace_seq *s,
struct tep_record *record,
struct tep_event *event, void *context)
Expand Down
4 changes: 4 additions & 0 deletions ras-mce-handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ int ras_mce_event_handler(struct trace_seq *s,
/* enables intel iMC logs */
int set_intel_imc_log(enum cputype cputype, unsigned ncpus);

/* Undertake AMD SMCA Error Decoding */
void decode_smca_error(struct mce_event *e, struct mce_priv *m);
void amd_decode_errcode(struct mce_event *e);

/* Per-CPU-type decoders for Intel CPUs */
void p4_decode_model(struct mce_event *e);
void core2_decode_model(struct mce_event *e);
Expand Down
10 changes: 10 additions & 0 deletions ras-record.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#define __RAS_RECORD_H

#include <stdint.h>
#include <stdbool.h>
#include "config.h"

#define ARRAY_SIZE(x) (sizeof(x)/sizeof(*(x)))
Expand All @@ -42,6 +43,15 @@ struct ras_mc_event {
const char *driver_detail;
};

/*
 * Raw values describing one MCA error to be post-processed (decoded
 * offline) instead of being received from a trace event.
 */
struct ras_mc_offline_event {
	unsigned int family;	/* CPU family */
	unsigned int model;	/* CPU model */
	bool smca;		/* request AMD SMCA error decoding */
	uint8_t bank;		/* bank number */
	uint64_t ipid;		/* IPID register value (SMCA systems only) */
	uint64_t synd;		/* syndrome register value */
	uint64_t status;	/* status register value */
};

struct ras_aer_event {
char timestamp[64];
const char *error_type;
Expand Down
94 changes: 93 additions & 1 deletion rasdaemon.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,21 @@ struct arguments {
int record_events;
int enable_ras;
int foreground;
int offline;
};

/*
 * argp keys for the post-processing options. The values start at 0x100,
 * i.e. outside the range of printable ASCII characters, so argp exposes
 * them as long options only (no single-letter short form).
 */
enum OFFLINE_ARG_KEYS {
	SMCA		= 0x100,
	MODEL		= 0x101,
	FAMILY		= 0x102,
	BANK_NUM	= 0x103,
	IPID_REG	= 0x104,
	STATUS_REG	= 0x105,
	SYNDROME_REG	= 0x106
};

/*
 * Raw register values gathered from the post-processing command-line
 * options; filled in by parse_opt_offline() and handed to
 * ras_offline_mce_event() by main().
 */
struct ras_mc_offline_event event;

static error_t parse_opt(int k, char *arg, struct argp_state *state)
{
struct arguments *args = state->input;
Expand All @@ -62,25 +75,95 @@ static error_t parse_opt(int k, char *arg, struct argp_state *state)
case 'f':
args->foreground++;
break;
#ifdef HAVE_MCE
case 'p':
if (state->argc < 4)
argp_state_help(state, stdout, ARGP_HELP_LONG | ARGP_HELP_EXIT_ERR);
args->offline++;
break;
#endif
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}

#ifdef HAVE_MCE
/*
 * argp parser for the post-processing options; stores the parsed values in
 * the global 'event'.
 *
 * @key:   one of enum OFFLINE_ARG_KEYS (anything else is left to the other
 *         parsers by returning ARGP_ERR_UNKNOWN).
 * @arg:   unused. The options are registered without an argument, so the
 *         value is read from the command-line word that follows the option
 *         (state->argv[state->next]) instead.
 * @state: argp parsing state.
 *
 * Numeric values are parsed with base auto-detection (so a 0x prefix selects
 * hex), except the bank number, which is parsed as decimal.
 *
 * Returns 0 on success, ARGP_ERR_UNKNOWN for keys not handled here, and
 * reports a usage error (returning EINVAL if argp_error() comes back) when a
 * value-taking option is the last word on the command line.
 */
static error_t parse_opt_offline(int key, char *arg,
				 struct argp_state *state)
{
	const char *name;
	const char *val;

	(void)arg;

	switch (key) {
	case SMCA:
		/* Plain flag: no value to fetch. */
		event.smca = true;
		return 0;
	case MODEL:
		name = "model";
		break;
	case FAMILY:
		name = "family";
		break;
	case BANK_NUM:
		name = "bank";
		break;
	case IPID_REG:
		name = "ipid";
		break;
	case STATUS_REG:
		name = "status";
		break;
	case SYNDROME_REG:
		name = "synd";
		break;
	default:
		return ARGP_ERR_UNKNOWN;
	}

	/*
	 * Without this check an option given as the last word would read
	 * argv[argc], which is NULL, and pass it to strtoul()/strtoull().
	 */
	if (state->next >= state->argc || !state->argv[state->next]) {
		argp_error(state, "option '--%s' requires a value", name);
		return EINVAL;
	}
	val = state->argv[state->next];

	switch (key) {
	case MODEL:
		event.model = strtoul(val, NULL, 0);
		break;
	case FAMILY:
		event.family = strtoul(val, NULL, 0);
		break;
	case BANK_NUM:
		/* Decimal, as before, but with defined overflow behaviour. */
		event.bank = strtol(val, NULL, 10);
		break;
	case IPID_REG:
		event.ipid = strtoull(val, NULL, 0);
		break;
	case STATUS_REG:
		event.status = strtoull(val, NULL, 0);
		break;
	case SYNDROME_REG:
		event.synd = strtoull(val, NULL, 0);
		break;
	}
	return 0;
}
#endif

long user_hz;

int main(int argc, char *argv[])
{
struct arguments args;
int idx = -1;

#ifdef HAVE_MCE
const struct argp_option offline_options[] = {
{"smca", SMCA, 0, 0, "AMD SMCA Error Decoding"},
{"model", MODEL, 0, 0, "CPU Model"},
{"family", FAMILY, 0, 0, "CPU Family"},
{"bank", BANK_NUM, 0, 0, "Bank Number"},
{"ipid", IPID_REG, 0, 0, "IPID Register (for SMCA systems only)"},
{"status", STATUS_REG, 0, 0, "Status Register"},
{"synd", SYNDROME_REG, 0, 0, "Syndrome Register"},
{0, 0, 0, 0, 0, 0},
};

struct argp offline_argp = {
.options = offline_options,
.parser = parse_opt_offline,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,
};

struct argp_child offline_parser[] = {
{&offline_argp, 0, "Post-Processing Options:", 0},
{0, 0, 0, 0},
};
#endif

const struct argp_option options[] = {
{"enable", 'e', 0, 0, "enable RAS events and exit", 0},
{"disable", 'd', 0, 0, "disable RAS events and exit", 0},
#ifdef HAVE_SQLITE3
{"record", 'r', 0, 0, "record events via sqlite3", 0},
#endif
{"foreground", 'f', 0, 0, "run foreground, not daemonize"},
#ifdef HAVE_MCE
{"post-processing", 'p', 0, 0,
"Post-processing MCE's with raw register values"},
#endif

{ 0, 0, 0, 0, 0, 0 }
};
Expand All @@ -89,7 +172,9 @@ int main(int argc, char *argv[])
.parser = parse_opt,
.doc = TOOL_DESCRIPTION,
.args_doc = ARGS_DOC,

#ifdef HAVE_MCE
.children = offline_parser,
#endif
};
memset (&args, 0, sizeof(args));

Expand All @@ -111,6 +196,13 @@ int main(int argc, char *argv[])
return 0;
}

#ifdef HAVE_MCE
if (args.offline) {
ras_offline_mce_event(&event);
return 0;
}
#endif

openlog(TOOL_NAME, 0, LOG_DAEMON);
if (!args.foreground)
if (daemon(0,0))
Expand Down

0 comments on commit 932118b

Please sign in to comment.