Skip to content
Browse files

Fixed an issue with the Pulse-to-redis code — the builduid is not as
  unique as I need it to be (and yes, Catlee had warned me ;) — so instead
  of storing build:<builduid> as the final hash, I made it a set and then
  added a build:<jobid> hash which is keyed by builduid, master,
  and buildnumber.
Also checked in the first pass of kittenmonitor.py — it looks at the last
  N hours of build:<yyyy-mm-dd>.<hh> activity and then breaks that
  activity down into a dashboard:<yyyy-mm-dd>.<hh> hash to be consumed by
  the bpDashboard code.  kittenmonitor also checks for certain thresholds
  and will email an alert when they are exceeded.
  • Loading branch information...
1 parent 95f3198 commit fe668e68d3ef1f6895be1a71ffb941dcb1634fd9 @bear bear committed Mar 16, 2012
Showing with 268 additions and 27 deletions.
  1. +4 −4 PulseBroker.py
  2. +24 −21 bpServer.py
  3. +1 −1 kittenherder.py
  4. +236 −0 kittenmonitor.py
  5. +3 −0 releng/__init__.py
  6. +0 −1 releng/remote.py
View
8 PulseBroker.py
@@ -68,7 +68,7 @@
def OfflineTest(options):
log.info('Starting Offline message testing')
- hArchive = open('test.in', 'r+')
+ hArchive = open(options.testfile, 'r+')
for msg in hArchive:
job = json.loads(msg)
@@ -316,15 +316,15 @@ def pushJob(job):
_defaultOptions = { 'config': ('-c', '--config', None, 'Configuration file'),
- 'debug': ('-d', '--debug', True, 'Enable Debug', 'b'),
+ 'debug': ('-d', '--debug', True, 'Enable Debug', 'b'),
'appinfo': ('-a', '--appinfo', appInfo, 'Mozilla Pulse app string'),
'background': ('-b', '--background', False, 'daemonize ourselves', 'b'),
'logpath': ('-l', '--logpath', None, 'Path where log file is to be written'),
'redis': ('-r', '--redis', 'localhost:6379', 'Redis connection string'),
'redisdb': ('', '--redisdb', '8', 'Redis database'),
'pulse': ('-p', '--pulse', None, 'Pulse connection string'),
'topic': ('-t', '--topic', '#', 'Mozilla Pulse Topic filter string'),
- 'test': ('', '--test', False, 'Offline testing, uses archive file instead of Pulse server', 'b'),
+ 'testfile': ('', '--testfile', None, 'Offline testing, uses named file instead of Pulse server'),
}
@@ -340,7 +340,7 @@ def pushJob(job):
log.info('Creating ZeroMQ handler')
Process(name='zmq', target=handleZMQ, args=(options, eventQueue, db)).start()
- if options.test:
+ if options.testfile:
OfflineTest(options)
else:
log.info('Connecting to Mozilla Pulse with topic "%s"' % options.topic)
View
45 bpServer.py
@@ -137,21 +137,21 @@ def worker(jobs, metrics, db, archivePath):
while True:
try:
- job = jobs.get(False)
+ entry = jobs.get(False)
except Empty:
- job = None
+ entry = None
- if job is not None:
+ if entry is not None:
try:
- item = json.loads(job)
+ item = json.loads(entry)
- event = item['event']
- key = item['pulse_key']
- master = item['master'].partition(':')[0].partition('.')[0]
- ts = item['time']
- jobKey = key.split('.')[1]
+ event = item['event']
+ key = item['pulse_key']
+ master = item['master'].partition(':')[0].partition('.')[0]
+ ts = item['time']
+ entryKey = key.split('.')[1]
- log.debug('Job: %s %s' % (event, key))
+ log.debug('Job: %s %s %s' % (event, key, ts))
outbound = [(METRICS_COUNT, ('metrics', 'pulse'))]
@@ -210,26 +210,28 @@ def worker(jobs, metrics, db, archivePath):
for p in item['pulse']['payload']['build']['properties']:
pName, pValue, _ = p
if pName in ('branch', 'product', 'platform', 'revision', 'builduid',
- 'build_url', 'pgo_build', 'scheduler', 'who'):
+ 'buildnumber', 'build_url', 'pgo_build', 'scheduler', 'who'):
properties[pName] = pValue
except:
log.error('exception extracting properties from build step', exc_info=True)
branch = properties['branch']
product = properties['product']
builduid = properties['builduid']
- buildKey = 'build:%s:%s' % (jobKey, builduid)
+ number = properties['buildnumber']
+ buildKey = 'build:%s' % builduid
+ jobKey = 'job:%s.%s.%s' % (builduid, master, number)
- db.hset(buildKey, 'slave', slave)
- db.hset(buildKey, 'master', master)
+ db.hset(jobKey, 'slave', slave)
+ db.hset(jobKey, 'master', master)
for p in properties:
- db.hset(buildKey, p, properties[p])
+ db.hset(jobKey, p, properties[p])
outbound.append((METRICS_COUNT, ('build', buildEvent)))
if buildEvent == 'started':
- db.hset(buildKey, 'started', ts)
+ db.hset(jobKey, 'started', ts)
outbound.append((METRICS_COUNT, ('build:started:slave', slave )))
outbound.append((METRICS_COUNT, ('build:started:master', master )))
@@ -242,30 +244,31 @@ def worker(jobs, metrics, db, archivePath):
outbound.append((METRICS_COUNT, ('build:finished:branch', branch )))
outbound.append((METRICS_COUNT, ('build:finished:product', product)))
- ts = db.hget(buildKey, 'started')
+ ts = db.hget(jobKey, 'started')
if ts is None:
ts = item['time']
- db.hset(buildKey, 'started', ts)
+ db.hset(jobKey, 'started', ts)
else:
dStarted = datetime.strptime(ts[:-6], '%Y-%m-%dT%H:%M:%S')
dFinished = datetime.strptime(item['time'][:-6], '%Y-%m-%dT%H:%M:%S')
tdElapsed = dFinished - dStarted
- db.hset(buildKey, 'finished', item['time'])
- db.hset(buildKey, 'elapsed', (tdElapsed.days * 86400) + tdElapsed.seconds)
+ db.hset(jobKey, 'finished', item['time'])
+ db.hset(jobKey, 'elapsed', (tdElapsed.days * 86400) + tdElapsed.seconds)
tsDate, tsTime = ts.split('T')
tsHour = tsTime[:2]
db.sadd('build:%s' % tsDate, buildKey)
db.sadd('build:%s.%s' % (tsDate, tsHour), buildKey)
+ db.sadd(buildKey, jobKey)
metrics.put(outbound)
except:
log.error('Error converting incoming job', exc_info=True)
if archive is not None:
- archive.write('%s\n' % job)
+ archive.write('%s\n' % entry)
aCount += 1
if aCount > ARCHIVE_CHUNK:
View
2 kittenherder.py
@@ -206,7 +206,7 @@ def writeCache(cachefile, cache):
options.tools = '/builds/tools'
if options.cachefile is None:
- options.cachefile = os.path.join(options.appPath, 'kittenwrangler_seen.dat')
+ options.cachefile = os.path.join(options.appPath, 'kittenherder_seen.dat')
if options.kittens is None:
options.kittens = urlNeedingReboot
View
236 kittenmonitor.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this file,
+# You can obtain one at http://mozilla.org/MPL/2.0/.
+
+""" kittenmonitor.py
+
+ :copyright: (c) 2012 by Mozilla
+ :license: MPLv2
+
+ Assumes Python v2.6+
+
+ Usage
+ -c --config Configuration file (json format)
+ default: None
+ -r --redis Redis server connection string
+ default: localhost:6379
+ --redisdb Redis database ID
+ default: 8
+ -d --debug Turn on debug logging
+ default: False
+
+ Sample Configuration file
+
+ { 'debug': True,
+ 'logpath': '.'
+ }
+
+ Authors:
+ bear Mike Taylor <bear@mozilla.com>
+"""
+
+import os, sys
+import time
+import json
+import logging
+import smtplib
+import email.utils
+
+from email.mime.text import MIMEText
+from datetime import date, datetime, timedelta
+
+from releng import initOptions, initLogs, dbRedis
+
+log = logging.getLogger()
+
+
+# build:mozilla-inbound-android-debug:b7de9902160746b3afaa3496b55ec8f3
+ # {'product': 'mobile',
+ # 'slave': 'linux-ix-slave36',
+ # 'branch': 'mozilla-inbound',
+ # 'started': '2012-03-14T15:02:11+01:00',
+ # 'platform': 'android-debug',
+ # 'master': 'buildbot-master25',
+ # 'scheduler': 'mozilla-inbound',
+ # 'builduid': 'b7de9902160746b3afaa3496b55ec8f3',
+ # 'revision': 'cb66ae517284fae3162cedeb78a3687fbfa0173d'}
+# build:ux_fedora_test-crashtest:8f190cb4a8974adcb2f40e2b5352d7e9
+ # {'product': 'firefox',
+ # 'build_url': 'http://stage.mozilla.org/pub/mozilla.org/firefox/tinderbox-builds/ux-linux/1331759247/firefox-14.0a1.en-US.linux-i686.tar.bz2',
+ # 'slave': 'talos-r3-fed-024',
+ # 'branch': 'ux',
+ # 'started': '2012-03-14T15:01:49+01:00',
+ # 'pgo_build': 'False',
+ # 'who': 'sendchange-unittest',
+ # 'elapsed': '286',
+ # 'platform': 'linux',
+ # 'finished': '2012-03-14T15:06:35+01:00',
+ # 'master': 'buildbot-master17',
+ # 'scheduler': 'tests-ux-fedora-opt-unittest',
+ # 'builduid': '8f190cb4a8974adcb2f40e2b5352d7e9',
+ # 'revision': 'f55dc14475ff'}
+
def gatherData(db, dToday, dHour, startThreshold=20):
    """Summarize one hour of build activity into a dashboard hash.

    Walks the build:<yyyy-mm-dd>.<hh> set, follows each build:<builduid>
    member to its job hashes, and aggregates per-slave ("kitten") counters.
    When at least one job was seen, the summary is written back to redis
    under dashboard:<yyyy-mm-dd>.<hh>.

    :param db:             redis wrapper offering smembers/hgetall/hset
    :param dToday:         date string, yyyy-mm-dd
    :param dHour:          hour string, hh
    :param startThreshold: alert when a single kitten starts more than this
                           many jobs in the hour (default 20)
    :returns: list of (host, alertName, value) tuples for triggered alerts
    """
    alerts    = []
    dashboard = { 'jobs': 0 }
    kittens   = {}
    builds    = {}

    log.debug('gathering build:%s.%s' % (dToday, dHour))

    for key in db.smembers('build:%s.%s' % (dToday, dHour)):
        for jobKey in db.smembers(key):
            build = db.hgetall(jobKey)

            builduid = build['builduid']
            kitten   = build['slave']

            if kitten not in kittens:
                kittens[kitten] = { 'revisions': [],
                                    'jobs':      [],
                                    'elapsed':   [],
                                    'starts':    0,
                                    'finishes':  0,
                                  }

            if 'started' in build:
                kittens[kitten]['starts'] += 1
            if 'finished' in build:
                kittens[kitten]['finishes'] += 1
            if 'elapsed' in build:
                kittens[kitten]['elapsed'].append(build['elapsed'])
            if 'revision' in build:
                kittens[kitten]['revisions'].append(build['revision'])

            kittens[kitten]['jobs'].append(jobKey)

            # aggregate per-builduid only; the raw job hashes are no longer
            # stored in this dict, so jobKey entries cannot collide with
            # (or mask) the builduid aggregates
            if builduid not in builds:
                builds[builduid] = { 'kittens':  [],
                                     'started':  None,
                                     'finished': None,
                                   }

            builds[builduid]['kittens'].append(kitten)

            if 'started' in build:
                builds[builduid]['started'] = build['started']
            if 'finished' in build:
                builds[builduid]['finished'] = build['finished']

            dashboard['jobs'] += 1

    dKey = 'dashboard:%s.%s' % (dToday, dHour)

    dashboard['kittens']           = len(kittens)
    dashboard['maxStarts']         = 0
    dashboard['maxStartsKitten']   = ''
    dashboard['maxFinishes']       = 0
    dashboard['maxFinishesKitten'] = ''
    dashboard['minElapsed']        = 99999
    dashboard['minElapsedKitten']  = ''
    dashboard['meanElapsed']       = 0
    dashboard['maxElapsed']        = 0
    dashboard['maxElapsedKitten']  = ''

    # accumulated across *all* kittens so the mean covers the whole hour,
    # not just the last host iterated
    totalElapsed = 0
    nElapsed     = 0

    for host in kittens:
        kitten = kittens[host]

        if kitten['starts'] > startThreshold:
            alerts.append((host, 'starts', kitten['starts']))

        if kitten['starts'] > dashboard['maxStarts']:
            dashboard['maxStarts']       = kitten['starts']
            dashboard['maxStartsKitten'] = host

        if kitten['finishes'] > dashboard['maxFinishes']:
            dashboard['maxFinishes']       = kitten['finishes']
            dashboard['maxFinishesKitten'] = host

        for e in kitten['elapsed']:
            try:
                elapsed = int(e)
            except (TypeError, ValueError):
                # treat an unparseable elapsed value as zero rather than
                # aborting the whole hour's aggregation
                elapsed = 0
            totalElapsed += elapsed
            nElapsed     += 1

            if elapsed > dashboard['maxElapsed']:
                dashboard['maxElapsed']       = elapsed
                dashboard['maxElapsedKitten'] = host
            if elapsed < dashboard['minElapsed']:
                dashboard['minElapsed']       = elapsed
                dashboard['minElapsedKitten'] = host

    if nElapsed > 0:
        dashboard['meanElapsed'] = totalElapsed // nElapsed

    log.debug('%s %s' % (dKey, dashboard))

    if dashboard['jobs'] > 0:
        for key in dashboard:
            db.hset(dKey, key, dashboard[key])

    return alerts
+
+
def sendAlertEmail(alerts, options):
    """Send a single alert email listing every triggered alert.

    :param alerts:  list of (host, alertName, value) tuples, as returned
                    by gatherData()
    :param options: parsed options; options.debug controls the SMTP
                    session debug output
    """
    body = '\r\nThe following alerts have been triggered during dashboard monitoring:'

    for host, alert, value in alerts:
        body += '\r\n\r\n%s %s %s this hour' % (host, alert, value)

    log.info('Sending alert email')
    log.debug(body)

    addr = 'release@mozilla.com'
    msg  = MIMEText(body)

    msg.set_unixfrom('briarpatch')
    msg['To']      = email.utils.formataddr(('RelEng', addr))
    msg['From']    = email.utils.formataddr(('briarpatch', addr))
    msg['Subject'] = '[briar-patch] monitor alert'

    server = smtplib.SMTP('localhost')
    try:
        server.set_debuglevel(options.debug)
        server.sendmail(addr, [addr], msg.as_string())
    finally:
        # always close the SMTP session, even if sendmail raises,
        # so a failed send does not leak the connection
        server.quit()
+
+
+
# command line option definitions: flag, long flag, default, help text,
# and an optional 'b' marker for boolean options
_defaultOptions = { 'config':  ('-c', '--config',  None,             'Configuration file'),
                    'debug':   ('-d', '--debug',   True,             'Enable Debug', 'b'),
                    'logpath': ('-l', '--logpath', None,             'Path where log file is to be written'),
                    'redis':   ('-r', '--redis',   'localhost:6379', 'Redis connection string'),
                    'redisdb': ('',   '--redisdb', '8',              'Redis database'),
                    'email':   ('-e', '--email',   False,            'send result email', 'b'),
                  }

if __name__ == '__main__':
    options = initOptions(_defaultOptions)
    initLogs(options)

    log.info('Starting')

    db = dbRedis(options)

    # walk backwards one hour at a time over the three most recent
    # hourly buckets; only the current hour may trigger an alert email
    oneHourBack = timedelta(hours=-1)
    dGather     = datetime.now()

    for hourIndex in range(3):
        alerts  = gatherData(db, dGather.strftime('%Y-%m-%d'), dGather.strftime('%H'))
        dGather = dGather + oneHourBack

        if hourIndex == 0 and alerts and options.email:
            sendAlertEmail(alerts, options)

    log.info('done')
+
View
3 releng/__init__.py
@@ -89,6 +89,9 @@ def __init__(self, options):
def ping(self):
return self._redis.ping()
+ def exists(self, key):
+ return self._redis.exists(key)
+
def lrange(self, listName, start, end):
return self._redis.lrange(listName, start, end)
View
1 releng/remote.py
@@ -201,7 +201,6 @@ def wait(self):
self.channel.sendall("\r\n")
data = self._read()
buf.append(data)
- print self.prompt, data
if data.endswith(self.prompt) and not self.channel.recv_ready():
break
time.sleep(1)

0 comments on commit fe668e6

Please sign in to comment.
Something went wrong with that request. Please try again.