diff --git a/recommendation/factory.py b/recommendation/factory.py index 236b67c..4e262fc 100644 --- a/recommendation/factory.py +++ b/recommendation/factory.py @@ -1,8 +1,13 @@ +from logging import INFO, StreamHandler +from sys import stdout + from celery import Celery from flask import Flask from recommendation import conf from recommendation.cors import cors_headers +from recommendation.mozlog.formatter import MozLogFormatter +from recommendation.mozlog.middleware import request_timer, request_summary from recommendation.views.dummy import dummy from recommendation.views.main import main from recommendation.views.static import static @@ -11,7 +16,11 @@ def create_app(): app = Flask(__name__) + + # Append CORS headers to each request. app.after_request(cors_headers) + + # Register views. app.register_blueprint(main) app.register_blueprint(static) app.register_blueprint(status) @@ -19,6 +28,18 @@ def create_app(): # Use a dummy data generator while Yahoo BOSS access is being sorted out. app.register_blueprint(dummy) + # Log using the mozlog format to stdout. + handler = StreamHandler(stream=stdout) + handler.setFormatter(MozLogFormatter(logger_name='universalSearch')) + handler.setLevel(INFO) + app.logger_name = 'request.summary' + app.logger.addHandler(handler) + app.logger.setLevel(INFO) + + # Use logging middleware. + app.before_request(request_timer) + app.after_request(request_summary) + app.config.update( CELERY_BROKER_URL=conf.CELERY_BROKER_URL, DEBUG=conf.DEBUG diff --git a/recommendation/mozlog/__init__.py b/recommendation/mozlog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/recommendation/mozlog/formatter.py b/recommendation/mozlog/formatter.py new file mode 100644 index 0000000..3cc75ef --- /dev/null +++ b/recommendation/mozlog/formatter.py @@ -0,0 +1,109 @@ +""" +Copied from +https://github.com/mozilla/testpilot/blob/master/testpilot/base/logging.py +""" +import logging +import json +import socket +import traceback + + +class MozLogFormatter(logging.Formatter): + """Log formatter that outputs machine-readable json. + + This log formatter outputs JSON format messages that are compatible with + Mozilla's standard heka-based log aggregation infrastructure. + + See also: + https://mana.mozilla.org/wiki/display/CLOUDSERVICES/Logging+Standard + https://mana.mozilla.org/wiki/pages/viewpage.action?pageId=42895640 + + Adapted from: + https://github.com/mozilla-services/mozservices/blob/master/mozsvc/util.py#L106 + """ + LOGGING_FORMAT_VERSION = "2.0" + + # Map from Python logging to Syslog severity levels + SYSLOG_LEVEL_MAP = { + 50: 2, # CRITICAL + 40: 3, # ERROR + 30: 4, # WARNING + 20: 6, # INFO + 10: 7, # DEBUG + } + + # Syslog level to use when/if python level isn't found in map + DEFAULT_SYSLOG_LEVEL = 7 + + EXCLUDED_LOGRECORD_ATTRS = set(( + 'args', 'asctime', 'created', 'exc_info', 'exc_text', 'filename', + 'funcName', 'levelname', 'levelno', 'lineno', 'module', 'msecs', + 'message', 'msg', 'name', 'pathname', 'process', 'processName', + 'relativeCreated', 'stack_info', 'thread', 'threadName' + )) + + def __init__(self, format=None, datefmt=None, style='%', + logger_name='TestPilot'): + super(MozLogFormatter, self).__init__(format, datefmt, style) + self.logger_name = logger_name + self.hostname = socket.gethostname() + + def format(self, record): + """ + Map from Python LogRecord attributes to JSON log format fields + + * from - + https://docs.python.org/3/library/logging.html#logrecord-attributes # + noqa + * to - + https://mana.mozilla.org/wiki/pages/viewpage.action?pageId=42895640 # + noqa + """ + out = dict( + Timestamp=int(record.created * 1e9), + Type=record.name, + Logger=self.logger_name, + Hostname=self.hostname, + EnvVersion=self.LOGGING_FORMAT_VERSION, + Severity=self.SYSLOG_LEVEL_MAP.get(record.levelno, + self.DEFAULT_SYSLOG_LEVEL), + Pid=record.process, + ) + + # Include any custom attributes set on the record. + # These would usually be collected metrics data. + fields = dict() + for key, value in record.__dict__.items(): + if key not in self.EXCLUDED_LOGRECORD_ATTRS: + fields[key] = value + + # Only include the 'message' key if it has useful content + # and is not already a JSON blob. + message = record.getMessage() + if message: + if not message.startswith("{") and not message.endswith("}"): + fields["message"] = message + + # If there is an error, format it for nice output. + if record.exc_info is not None: + fields["error"] = repr(record.exc_info[1]) + fields["traceback"] = safer_format_traceback(*record.exc_info) + + out['Fields'] = fields + + return json.dumps(out) + + +def safer_format_traceback(exc_typ, exc_val, exc_tb): + """Format an exception traceback into safer string. + We don't want to let users write arbitrary data into our logfiles, + which could happen if they e.g. managed to trigger a ValueError with + a carefully-crafted payload. This function formats the traceback + using "%r" for the actual exception data, which passes it through repr() + so that any special chars are safely escaped. + """ + lines = ["Uncaught exception:\n"] + lines.extend(traceback.format_tb(exc_tb)) + lines.append("%r\n" % (exc_typ,)) + lines.append("%r\n" % (exc_val,)) + return "".join(lines) diff --git a/recommendation/mozlog/middleware.py b/recommendation/mozlog/middleware.py new file mode 100644 index 0000000..9fded67 --- /dev/null +++ b/recommendation/mozlog/middleware.py @@ -0,0 +1,65 @@ +import json +import re +import time + +from flask import current_app, request + + +IS_PROTOCOL = r'^[^\s]+\:\S' +IS_HOSTNAME = r'^[^\s]+\.\S' + + +def request_timer(): + """ + before_request middleware that attaches the processing start time to the + request object, for later performance assessment. + """ + request.start_time = time.time() + + +def request_summary(response): + """ + after_request middleware that generates and logs a mozlog-formatted log + about the request. + + Read more: + + https://github.com/mozilla/universal-search/blob/master/docs/metrics.md + https://github.com/mozilla-services/Dockerflow/blob/master/docs/mozlog.md + """ + request.finish_time = time.time() + response.direct_passthrough = False + + log = { + 'predicates': {} + } + query = request.args.get('q') + data = response.get_data(as_text=True) + try: + body = json.loads(data) + except json.decoder.JSONDecodeError: + body = {} + + log['agent'] = request.headers.get('User-Agent') + log['errno'] = 0 if response.status_code < 400 else response.status_code + log['lang'] = request.headers.get('Accept-Language') + log['method'] = request.method + log['path'] = request.path + log['t'] = request.finish_time - request.start_time + + if query: + log['predicates']['query_length'] = len(query) > 20 + log['predicates']['is_protocol'] = (re.match(IS_PROTOCOL, query) is not + None) + log['predicates']['is_hostname'] = (re.match(IS_HOSTNAME, query) is not + None) + + if not any(log['predicates'].values()): + log['query'] = query if query else None + log['status_code'] = response.status_code + classifiers = body.get('enhancements') + log['classifiers'] = list(classifiers.keys()) if classifiers else [] + + current_app.logger.info('', extra=log) + + return response diff --git a/recommendation/mozlog/tests/__init__.py b/recommendation/mozlog/tests/__init__.py new file mode 100644 index 0000000..1cf6267 --- /dev/null +++ b/recommendation/mozlog/tests/__init__.py @@ -0,0 +1 @@ +VERSION = "0.1.0" diff --git a/recommendation/mozlog/tests/test_formatter.py b/recommendation/mozlog/tests/test_formatter.py new file mode 100644 index 0000000..bfe68f0 --- /dev/null +++ b/recommendation/mozlog/tests/test_formatter.py @@ -0,0 +1,168 @@ +""" +Copied from +https://github.com/mozilla/testpilot/blob/master/testpilot/base/tests.py +""" +import json +import logging +import os +from unittest import TestCase + +from testfixtures import LogCapture +import jsonschema + +from recommendation.mozlog.formatter import MozLogFormatter + +logger = logging.getLogger(__name__) + + +class TestMozLogFormatter(TestCase): + + def setUp(self): + self.handler = LogCapture() + self.logger_name = "TestingTestPilot" + self.formatter = MozLogFormatter(logger_name=self.logger_name) + + def tearDown(self): + self.handler.uninstall() + + def _fetchLastLog(self): + self.assertEquals(len(self.handler.records), 1) + details = json.loads(self.formatter.format(self.handler.records[0])) + jsonschema.validate(details, JSON_LOGGING_SCHEMA) + return details + + def test_basic_operation(self): + """Ensure log formatter contains all the expected fields and values""" + message_text = "simple test" + logging.debug(message_text) + details = self._fetchLastLog() + + expected_present = ["Timestamp", "Hostname"] + for key in expected_present: + self.assertTrue(key in details) + + expected_meta = { + "Severity": 7, + "Type": "root", + "Pid": os.getpid(), + "Logger": self.logger_name, + "EnvVersion": self.formatter.LOGGING_FORMAT_VERSION + } + for key, value in expected_meta.items(): + self.assertEquals(value, details[key]) + + self.assertEquals(details['Fields']['message'], message_text) + + def test_custom_paramters(self): + """Ensure log formatter can handle custom parameters""" + logger = logging.getLogger("mozsvc.test.test_logging") + logger.warn("custom test %s", "one", extra={"more": "stuff"}) + details = self._fetchLastLog() + + self.assertEquals(details["Type"], "mozsvc.test.test_logging") + self.assertEquals(details["Severity"], 4) + + fields = details['Fields'] + self.assertEquals(fields["message"], "custom test one") + self.assertEquals(fields["more"], "stuff") + + def test_logging_error_tracebacks(self): + """Ensure log formatter includes exception traceback information""" + try: + raise ValueError("\n") + except Exception: + logging.exception("there was an error") + details = self._fetchLastLog() + + expected_meta = { + "Severity": 3, + } + for key, value in expected_meta.items(): + self.assertEquals(value, details[key]) + + fields = details['Fields'] + expected_fields = { + 'message': 'there was an error', + 'error': "ValueError('\\n',)" + } + for key, value in expected_fields.items(): + self.assertEquals(value, fields[key]) + + self.assertTrue(fields['traceback'].startswith('Uncaught exception:')) + self.assertTrue("" in fields['traceback']) + + +# https://mana.mozilla.org/wiki/pages/viewpage.action?pageId=42895640 +JSON_LOGGING_SCHEMA = json.loads(""" +{ + "type":"object", + "required":["Timestamp"], + "properties":{ + "Timestamp":{ + "type":"integer", + "minimum":0 + }, + "Type":{ + "type":"string" + }, + "Logger":{ + "type":"string" + }, + "Hostname":{ + "type":"string", + "format":"hostname" + }, + "EnvVersion":{ + "type":"string", + "pattern":"^\\d+(?:\\.\\d+){0,2}$" + }, + "Severity":{ + "type":"integer", + "minimum":0, + "maximum":7 + }, + "Pid":{ + "type":"integer", + "minimum":0 + }, + "Fields":{ + "type":"object", + "minProperties":1, + "additionalProperties":{ + "anyOf": [ + { "$ref": "#/definitions/field_value"}, + { "$ref": "#/definitions/field_array"}, + { "$ref": "#/definitions/field_object"} + ] + } + } + }, + "definitions":{ + "field_value":{ + "type":["string", "number", "boolean"] + }, + "field_array":{ + "type":"array", + "minItems": 1, + "oneOf": [ + {"items": {"type":"string"}}, + {"items": {"type":"number"}}, + {"items": {"type":"boolean"}} + ] + }, + "field_object":{ + "type":"object", + "required":["value"], + "properties":{ + "value":{ + "oneOf": [ + { "$ref": "#/definitions/field_value" }, + { "$ref": "#/definitions/field_array" } + ] + }, + "representation":{"type":"string"} + } + } + } +} +""".replace("\\", "\\\\")) # HACK: Fix escaping for easy copy/paste diff --git a/recommendation/mozlog/tests/test_middleware.py b/recommendation/mozlog/tests/test_middleware.py new file mode 100644 index 0000000..b1300d5 --- /dev/null +++ b/recommendation/mozlog/tests/test_middleware.py @@ -0,0 +1,135 @@ +import json +from time import time + +from mock import MagicMock, patch +from nose.tools import eq_, ok_ + +from recommendation.mozlog.middleware import request_summary, request_timer +from recommendation.tests.util import AppTestCase + + +WITH_CLASSIFIERS = { + 'enhancements': { + 'hello': '', + 'frend': '' + } +} +DUMMY = 'Hello, frend.' +REQUEST_PATH = 'recommendation.mozlog.middleware.request' + + +class TestMozLogMiddleware(AppTestCase): + def _response(self, data=None, status_code=200, **kwargs): + response = MagicMock() + response.get_data.return_value = json.dumps(data or {}) + response.status_code = status_code + return response + + def _request(self, args='hello', headers='world', path='/', + method='GET'): + request = MagicMock() + request.args.get.return_value = args + request.headers.get.return_value = headers + request.method = method + request.path = path + request.start_time = time() + return request + + def _request_summary(self, request_args=None, response_args=None): + request_args = request_args or {} + response_args = response_args or {} + with patch('recommendation.mozlog.middleware.current_app') as app: + mock_request = self._request(**request_args) + mock_response = self._response(**response_args) + with patch(REQUEST_PATH, mock_request): + request_summary(mock_response) + return (app.logger.info.call_args[1]['extra'], mock_request, + mock_response) + + def _query(self, query): + return self._request_summary(request_args={ + 'args': query + }) + + def _predicates(self, query): + return self._query(query)[0]['predicates'] + + @patch(REQUEST_PATH) + def test_request_timer(self, mock_request): + request_timer() + ok_(type(mock_request.start_time), float) + + def test_request_summary_headers(self): + output, req, res = self._request_summary(request_args={ + 'headers': DUMMY + }) + eq_(output['agent'], DUMMY) + eq_(output['lang'], DUMMY) + + def test_request_summary_errno_400(self): + output, req, res = self._request_summary(response_args={ + 'status_code': 400 + }) + eq_(output['errno'], 400) + + def test_request_summary_errno_200(self): + output, req, res = self._request_summary(response_args={ + 'status_code': 200 + }) + eq_(output['errno'], 0) + + def test_request_summary_method(self): + output, req, res = self._request_summary(request_args={ + 'method': DUMMY + }) + eq_(output['method'], DUMMY) + + def test_request_summary_path(self): + output, req, res = self._request_summary(request_args={ + 'path': DUMMY + }) + eq_(output['path'], DUMMY) + + def test_request_summary_t(self): + output, req, res = self._request_summary() + eq_(output['t'], req.finish_time - req.start_time) + + def test_request_summary_predicates_ok(self): + ok_(not any(self._predicates('the mart').values())) + + def test_request_summary_query_too_long(self): + ok_(self._predicates('q' * 25)['query_length']) + + def test_request_summary_query_is_protocol(self): + ok_(self._predicates('http://')['is_protocol']) + ok_(self._predicates('https://')['is_protocol']) + ok_(self._predicates('file://')['is_protocol']) + ok_(not self._predicates('hello: //')['is_protocol']) + ok_(not self._predicates('hello ://')['is_protocol']) + + def test_request_summary_query_is_hostname(self): + ok_(self._predicates('www.m')['is_hostname']) + ok_(self._predicates('thisislong.but')['is_hostname']) + ok_(not self._predicates('hello. friend')['is_hostname']) + + def test_request_summary_query(self): + eq_(self._query(DUMMY)[0]['query'], DUMMY) + + def test_request_summary_status_code_200(self): + output, req, res = self._request_summary(response_args={ + 'status_code': 200 + }) + eq_(output['status_code'], 200) + + def test_request_summary_status_code_400(self): + output, req, res = self._request_summary(response_args={ + 'status_code': 400 + }) + eq_(output['status_code'], 400) + + def test_request_summary_classifiers(self): + output, req, res = self._request_summary(response_args={ + 'data': WITH_CLASSIFIERS + }) + eq_(output['classifiers'], + list(WITH_CLASSIFIERS['enhancements'].keys())) diff --git a/requirements.txt b/requirements.txt index 1cff6d3..d16ba2b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ Flask==0.10.1 flask-testing==0.4.2 itsdangerous==0.24 Jinja2==2.8 +jsonschema==2.5.1 MarkupSafe==0.23 mock==1.3.0 nose==1.3.7 @@ -16,6 +17,7 @@ pylibmc==1.5.1 redis==2.10.5 requests==2.9.1 responses==0.5.1 +testfixtures==4.9.1 uwsgi==2.0.12 Werkzeug==0.11.5 wheel==0.29.0