Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

bug 836425 - nagios alerts introspection, r=rhelmer

  • Loading branch information...
commit fd35afc6aa9f537c7475c2de8163640887274fc7 1 parent 69c470e
@peterbe peterbe authored
View
38 docs/crontabber.rst
@@ -260,6 +260,44 @@ This will attempt it again and no matter if it works or errors it will
pick up the frequency from the configuration and update what time it
will run next.
+Resetting a job
+---------------
+
+If you want to pretend that a job has never run before you can use the
+``--reset`` switch. It expects the name of the app. Like this::
+
+ python socorro/cron/crontabber.py --reset=my-app-name
+
+That's going to wipe that job out of the state database, rendering it
+basically as if it's never run before. That can make this tool useful
+for bootstrapping new apps that don't work on the first run, or when you
+know what you're doing and you just want it to start afresh.
+
+Nagios monitoring
+-----------------
+
+To hook up crontabber to Nagios monitoring as an NRPE plugin you can
+use the ``--nagios`` switch like this::
+
+ python socorro/cron/crontabber.py --nagios
+
+What this will do is the following:
+
+1. If there are no recorded errors in any app, exit with code 0 and no
+ message.
+
+2. If an app has exactly 1 error count, then:
+
+ 1. If it's backfill based (meaning it should hopefully self-heal) it
+ will exit with code 1 and a message to ``stdout`` that starts with
+ the word ``WARNING`` and also prints the name of the app, the name
+ of the class, the exception type and the exception value.
+
+ 2. If it's **not** a backfill based app, it will exit with code 2 and a
+ message on ``stdout`` starting with the word ``CRITICAL`` followed
+ by the name of the app, the name of the class, the exception type
+ and the exception value.
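+
+3. If an app has an error count greater than 1, it will exit with
+ code 2 and a message on ``stdout`` starting with the word
+ ``CRITICAL``, regardless of whether the app is backfill based.
+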
Frequency and execution time
----------------------------
View
72 socorro/cron/crontabber.py
@@ -21,7 +21,11 @@
from socorro.app.generic_app import App, main
from socorro.lib.datetimeutil import utc_now, UTC
-from socorro.cron.base import convert_frequency, FrequencyDefinitionError
+from socorro.cron.base import (
+ convert_frequency,
+ FrequencyDefinitionError,
+ BaseBackfillCronApp
+)
DEFAULT_JOBS = '''
@@ -480,11 +484,22 @@ class CronTabber(App):
exclude_from_dump_conf=True,
)
+ required_config.add_option(
+ name='nagios',
+ default=False,
+ doc='Exits with 0, 1 or 2 with a message on stdout if errors have '
+ 'happened.',
+ short_form='n',
+ exclude_from_print_conf=True,
+ exclude_from_dump_conf=True,
+ )
def main(self):
if self.config.get('list-jobs'):
self.list_jobs()
- if self.config.get('reset-job'):
+ elif self.config.get('nagios'):
+ return self.nagios()
+ elif self.config.get('reset-job'):
self.reset_job(self.config.get('reset-job'))
elif self.config.get('job'):
self.run_one(self.config['job'], self.config.get('force'))
@@ -504,6 +519,56 @@ def database(self):
self._database.load(self.config.crontabber.database_file)
return self._database
+ def nagios(self, stream=sys.stdout):
+ """
+ return 0 (OK) if there are no errors in the state.
+ return 1 (WARNING) if a backfill app only has 1 error.
+ return 2 (CRITICAL) if a backfill app has > 1 error.
+ return 2 (CRITICAL) if a non-backfill app has 1 error.
+ """
+ warnings = []
+ criticals = []
+ for class_name, job_class in self.config.crontabber.jobs.class_list:
+ if job_class.app_name in self.database:
+ info = self.database.get(job_class.app_name)
+ if not info.get('error_count', 0):
+ continue
+ error_count = info['error_count']
+ # trouble!
+ serialized = (
+ '%s (%s) | %s | %s' %
+ (job_class.app_name,
+ class_name,
+ info['last_error']['type'],
+ info['last_error']['value'])
+ )
+ if (
+ error_count == 1 and
+ issubclass(job_class, BaseBackfillCronApp)
+ ):
+ # just a warning for now
+ warnings.append(serialized)
+ else:
+ # anything worse than that is critical
+ criticals.append(serialized)
+
+ if criticals:
+ stream.write('CRITICAL - ')
+ for each in criticals:
+ stream.write(each)
+ stream.write('\n')
+ elif warnings:
+ stream.write('WARNING - ')
+ for each in warnings:
+ stream.write(each)
+ stream.write('\n')
+
+ if criticals:
+ return 2
+ elif warnings:
+ return 1
+ return 0
+
def list_jobs(self, stream=None):
if not stream:
stream = sys.stdout
@@ -558,9 +623,7 @@ def reset_job(self, description):
job_class.app_name == description or
description == job_class.__module__ + '.' + job_class.__name__
):
- class_config = self.config.crontabber['class-%s' % class_name]
if job_class.app_name in self.database:
- info = self.database.get(job_class.app_name)
self.config.logger.info('App reset')
self.database.pop(job_class.app_name)
self.database.save(self.config.crontabber.database_file)
@@ -569,7 +632,6 @@ def reset_job(self, description):
return
raise JobNotFoundError(description)
-
def run_all(self):
for class_name, job_class in self.config.crontabber.jobs.class_list:
class_config = self.config.crontabber['class-%s' % class_name]
View
75 socorro/unittest/cron/test_crontabber.py
@@ -1274,6 +1274,73 @@ def test_reset_job(self):
db.load(json_file)
self.assertTrue('basic-job' not in db)
+ def test_nagios_ok(self):
+ config_manager, json_file = self._setup_config_manager(
+ 'socorro.unittest.cron.test_crontabber.BasicJob|1d\n'
+ 'socorro.unittest.cron.test_crontabber.FooJob|1d'
+ )
+ with config_manager.context() as config:
+ tab = crontabber.CronTabber(config)
+ tab.run_all()
+ stream = StringIO()
+ exit_code = tab.nagios(stream=stream)
+ self.assertEqual(exit_code, 0)
+ self.assertEqual(stream.getvalue(), '')
+
+ def test_nagios_warning(self):
+ config_manager, json_file = self._setup_config_manager(
+ 'socorro.unittest.cron.test_crontabber.BasicJob|1d\n'
+ 'socorro.unittest.cron.test_crontabber.BackfillbasedTrouble|1d'
+ )
+ with config_manager.context() as config:
+ tab = crontabber.CronTabber(config)
+ tab.run_all()
+ stream = StringIO()
+ exit_code = tab.nagios(stream=stream)
+ self.assertEqual(exit_code, 1)
+ output = stream.getvalue()
+ self.assertTrue('WARNING' in output)
+ self.assertTrue('backfill-trouble' in output)
+ self.assertTrue('BackfillbasedTrouble' in output)
+ self.assertTrue('NameError' in output)
+ self.assertTrue('bla bla' in output)
+
+ # run it a second time
+ # wind the clock forward
+ self._wind_clock(json_file, days=1)
+
+ # this forces the crontabber instance to reload the JSON file
+ tab._database = None
+
+ tab.run_all()
+ stream = StringIO()
+ exit_code = tab.nagios(stream=stream)
+ self.assertEqual(exit_code, 2)
+ output = stream.getvalue()
+ self.assertTrue('CRITICAL' in output)
+ self.assertTrue('backfill-trouble' in output)
+ self.assertTrue('BackfillbasedTrouble' in output)
+ self.assertTrue('NameError' in output)
+ self.assertTrue('bla bla' in output)
+
+ def test_nagios_critical(self):
+ config_manager, json_file = self._setup_config_manager(
+ 'socorro.unittest.cron.test_crontabber.BasicJob|1d\n'
+ 'socorro.unittest.cron.test_crontabber.TroubleJob|1d'
+ )
+ with config_manager.context() as config:
+ tab = crontabber.CronTabber(config)
+ tab.run_all()
+ stream = StringIO()
+ exit_code = tab.nagios(stream=stream)
+ self.assertEqual(exit_code, 2)
+ output = stream.getvalue()
+ self.assertTrue('CRITICAL' in output)
+ self.assertTrue('trouble' in output)
+ self.assertTrue('TroubleJob' in output)
+ self.assertTrue('NameError' in output)
+ self.assertTrue('Trouble!!' in output)
+
#==============================================================================
@attr(integration='postgres') # for nosetests
@@ -1538,7 +1605,6 @@ def run(self):
super(BasicJob, self).run()
-
class FooJob(_Job):
app_name = 'foo'
@@ -1629,6 +1695,13 @@ class FooBackfillJob(_BackfillJob):
app_name = 'foo-backfill'
+class BackfillbasedTrouble(_BackfillJob):
+ app_name = 'backfill-trouble'
+
+ def run(self, date):
+ raise NameError('bla bla')
+
+
class CertainDayHaterBackfillJob(_BackfillJob):
app_name = 'certain-day-hater-backfill'
Please sign in to comment.
Something went wrong with that request. Please try again.