Skip to content

Commit

Permalink
rasdaemon: add support for memory Corrected Error predictive failure …
Browse files Browse the repository at this point in the history
…analysis

Memory Corrected Errors have already been corrected by hardware. These errors do not
require immediate software actions, but are still reported for
accounting and predictive failure analysis.

Based on statistical results, some actions can be taken to prevent
Corrected Errors from evolving into Uncorrected Errors.

Signed-off-by: wuyun <wuyun.wu@huawei.com>
Signed-off-by: lvying6 <lvying6@huawei.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
  • Loading branch information
wuyun authored and mchehab committed Jul 21, 2020
1 parent 5fd96f4 commit 9ae6b70
Show file tree
Hide file tree
Showing 11 changed files with 466 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ before_install:
- sudo apt-get install -y sqlite3
install:
- autoreconf -vfi
- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode
- ./configure --enable-sqlite3 --enable-aer --enable-non-standard --enable-arm --enable-mce --enable-extlog --enable-devlink --enable-diskerror --enable-abrt-report --enable-hisi-ns-decode --enable-memory-ce-pfa

script:
- make && sudo make install
Expand Down
5 changes: 4 additions & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,15 @@ endif
if WITH_HISI_NS_DECODE
rasdaemon_SOURCES += non-standard-hisi_hip07.c non-standard-hisi_hip08.c
endif
if WITH_MEMORY_CE_PFA
rasdaemon_SOURCES += rbtree.c ras-page-isolation.c
endif
rasdaemon_LDADD = -lpthread $(SQLITE3_LIBS) libtrace/libtrace.a

include_HEADERS = config.h ras-events.h ras-logger.h ras-mc-handler.h \
ras-aer-handler.h ras-mce-handler.h ras-record.h bitfield.h ras-report.h \
ras-extlog-handler.h ras-arm-handler.h ras-non-standard-handler.h \
ras-devlink-handler.h ras-diskerror-handler.h
ras-devlink-handler.h ras-diskerror-handler.h rbtree.h ras-page-isolation.h

# This rule can't be called with more than one Makefile job (like make -j8)
# I can't figure out a way to fix that
Expand Down
11 changes: 11 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,16 @@ AS_IF([test "x$enable_hisi_ns_decode" = "xyes" || test "x$enable_all" == "xyes"]
AM_CONDITIONAL([WITH_HISI_NS_DECODE], [test x$enable_hisi_ns_decode = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_HISI_NS_DECODE], [USE_HISI_NS_DECODE="yes"], [USE_HISI_NS_DECODE="no"])

AC_ARG_ENABLE([memory_ce_pfa],
AS_HELP_STRING([--enable-memory-ce-pfa], [enable memory Corrected Error predictive failure analysis]))

AS_IF([test "x$enable_memory_ce_pfa" = "xyes" || test "x$enable_all" == "xyes"], [
AC_DEFINE(HAVE_MEMORY_CE_PFA,1,"have memory corrected error predictive failure analysis")
AC_SUBST([WITH_MEMORY_CE_PFA])
])
AM_CONDITIONAL([WITH_MEMORY_CE_PFA], [test x$enable_memory_ce_pfa = xyes || test x$enable_all == xyes])
AM_COND_IF([WITH_MEMORY_CE_PFA], [USE_MEMORY_CE_PFA="yes"], [USE_MEMORY_CE_PFA="no"])

test "$sysconfdir" = '${prefix}/etc' && sysconfdir=/etc

CFLAGS="$CFLAGS -Wall -Wmissing-prototypes -Wstrict-prototypes"
Expand Down Expand Up @@ -162,4 +172,5 @@ compile time options summary
ARM events : $USE_ARM
DEVLINK : $USE_DEVLINK
Disk I/O errors : $USE_DISKERROR
Memory CE PFA : $USE_MEMORY_CE_PFA
EOF
7 changes: 7 additions & 0 deletions man/rasdaemon.1.in
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ feature.
.BI "--version"
Print the program version and exit.

.SH CONFIG FILE

The \fBrasdaemon\fR program supports a config file to set rasdaemon systemd service
environment variables. By default the config file is read from /etc/sysconfig/rasdaemon.

The general format is environmentname=value.

.SH SEE ALSO
\fBras-mc-ctl\fR(8)

29 changes: 29 additions & 0 deletions misc/rasdaemon.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Page Isolation
# Note: Run-time configuration is unsupported, service restart needed.
# Note: this file should be installed at /etc/sysconfig/rasdaemon

# Specify the threshold of isolating buggy pages.
#
# Format:
# [0-9]+[unit]
# Notice: please make sure the value matches this format; rasdaemon falls back to the default value for invalid input.
#
# Supported units:
# PAGE_CE_REFRESH_CYCLE: D|d (day), H|h (hour), M|m (min), default unit is hour
# PAGE_CE_THRESHOLD: K|k (x1000), M|m (x1000k), default is none
#
# The two settings below are ignored only when PAGE_CE_ACTION is "off".
PAGE_CE_REFRESH_CYCLE="24h"
PAGE_CE_THRESHOLD="50"

# Specify the action rasdaemon takes internally when a page exceeds its error threshold.
#
# off no action
# account only account errors
# soft try to soft-offline page without killing any processes
# This requires an up-to-date kernel. Might not be successful.
# hard try to hard-offline page by killing processes
# Requires an up-to-date kernel. Might not be successful.
# soft-then-hard First try to soft offline, then try hard offlining.
# Note: default offline choice is "soft".
PAGE_CE_ACTION="soft"
1 change: 1 addition & 0 deletions misc/rasdaemon.service.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ Description=RAS daemon to log the RAS events
After=syslog.target

[Service]
EnvironmentFile=/etc/sysconfig/rasdaemon
ExecStart=@sbindir@/rasdaemon -f -r
ExecStartPost=@sbindir@/rasdaemon --enable
ExecStop=@sbindir@/rasdaemon --disable
Expand Down
2 changes: 2 additions & 0 deletions misc/rasdaemon.spec.in
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ make %{?_smp_mflags}

%install
make install DESTDIR=%{buildroot}
install -D -p -m 0644 misc/rasdaemon.env %{buildroot}%{_sysconfdir}/sysconfig/%{name}
install -D -p -m 0644 misc/rasdaemon.service %{buildroot}/%{_unitdir}/rasdaemon.service
install -D -p -m 0644 misc/ras-mc-ctl.service %{buildroot}%{_unitdir}/ras-mc-ctl.service
rm INSTALL %{buildroot}/usr/include/*.h
Expand All @@ -56,6 +57,7 @@ rm INSTALL %{buildroot}/usr/include/*.h
%{_unitdir}/*.service
%{_sharedstatedir}/rasdaemon
%{_sysconfdir}/ras/dimm_labels.d
%config(noreplace) %{_sysconfdir}/sysconfig/%{name}

%changelog

Expand Down
6 changes: 6 additions & 0 deletions ras-events.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "ras-diskerror-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"

/*
* Polling time, if read() doesn't block. Currently, trace_pipe_raw never
Expand Down Expand Up @@ -803,6 +804,11 @@ int handle_ras_events(int record_events)
ras->page_size = page_size;
ras->record_events = record_events;

#ifdef HAVE_MEMORY_CE_PFA
/* FIXME: enable memory isolation unconditionally */
ras_page_account_init();
#endif

rc = add_event_handler(ras, pevent, page_size, "ras", "mc_event",
ras_mc_event_handler, NULL, MC_EVENT);
if (!rc)
Expand Down
7 changes: 7 additions & 0 deletions ras-mc-handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "ras-mc-handler.h"
#include "ras-record.h"
#include "ras-logger.h"
#include "ras-page-isolation.h"
#include "ras-report.h"

int ras_mc_event_handler(struct trace_seq *s,
Expand Down Expand Up @@ -183,6 +184,12 @@ int ras_mc_event_handler(struct trace_seq *s,

ras_store_mc_event(ras, &ev);

#ifdef HAVE_MEMORY_CE_PFA
/* Account page corrected errors */
if (!strcmp(ev.error_type, "Corrected"))
ras_record_page_error(ev.address, ev.error_count, now);
#endif

#ifdef HAVE_ABRT_REPORT
/* Report event to ABRT */
ras_report_mc_event(ras, &ev);
Expand Down

0 comments on commit 9ae6b70

Please sign in to comment.