Permalink
Browse files

Parse and process Windows logs from S3

  • Loading branch information...
1 parent f2fbe8c commit 632571b3f7b7967a632cf57fe03fb8e698945415 @andreyvit andreyvit committed Apr 9, 2013
Showing with 249 additions and 2 deletions.
  1. +4 −0 get-s3.sh
  2. +14 −0 lib/datafiles.coffee
  3. +7 −0 lib/granularities.coffee
  4. +35 −0 lib/processing/s3-to-raw.coffee
  5. +1 −0 lib/rawentries.coffee
  6. +174 −0 lib/s3.coffee
  7. +2 −1 package.json
  8. +4 −0 publish-win.sh
  9. +8 −1 run.sh
View
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Mirror the ping logs from the S3 bucket into the local data/s3 directory.
+dest=data/s3
+mkdir -p "$dest"
+echo "S3CMD $dest"
+s3cmd -v sync s3://livereload-logs/ping/logs/ "$dest/"
View
@@ -48,6 +48,19 @@ class ApacheDataFileGroup extends DataFileGroup
parse: (data) -> data.split("\n")
+# Group of raw S3 access-log files kept under `subdir`.
+# File ids are the file names with the leading "access_log-" removed.
+class S3DataFileGroup extends DataFileGroup
+
+  constructor: (subdir) ->
+    super(subdir, 'subday', 0, '')
+    # Pattern for the file-name prefix of this group
+    # (presumably used by the base class to pick files - confirm).
+    @regexp = /^access_log-/
+
+  # Ids cannot be mapped back to file names for this group.
+  idToFileName: (id) ->
+    throw new Error "Unsupported"
+
+  # Drops the "access_log-" prefix; the remainder is the id as-is.
+  # (The previous second replace rebuilt exactly the string it matched, so it was a no-op.)
+  fileNameToId: (name) ->
+    name.replace(/^access_log-/, '')
+
+  # Splits the file contents into lines.
+  parse: (data) ->
+    data.split("\n")
+
+
class DataFile
constructor: (@group, @path, @id) ->
@@ -76,6 +89,7 @@ class DataFile
exports.DataFileGroups = DataFileGroups =
apache: new ApacheDataFileGroup('apache')
+ s3: new S3DataFileGroup('s3')
raw: new DataFileGroup('raw', 'day', 0, '')
rawxx: new DataFileGroup('rawxx', 'day', 0, '')
html: new DataFileGroup('html', 'none', 0, '')
@@ -3,6 +3,7 @@ DAY_FORMAT = '{yyyy}-{MM}-{dd}'
MONTH_FORMAT = '{yyyy}-{MM}'
YEAR_FORMAT = '{yyyy}'
+SUBDAY_REGEXP = /^(((\d{4})-\d{2})-\d{2})-.*$/ # $1 = yyyy-MM-dd, $2 = yyyy-MM, $3 = yyyy; the part after the day is matched but not captured
DAY_REGEXP = /^((\d{4})-\d{2})-\d{2}$/
MONTH_REGEXP = /^(\d{4})-\d{2}$/
@@ -56,6 +57,12 @@ class Granularity
module.exports = G =
+ subday: # periods of the form yyyy-MM-dd-<anything>, as matched by SUBDAY_REGEXP
+ Object.merge new Granularity('subday'),
+ outerday: (period) -> period.replace(SUBDAY_REGEXP, '$1') # keeps the yyyy-MM-dd prefix
+ outermonth: (period) -> period.replace(SUBDAY_REGEXP, '$2') # keeps the yyyy-MM prefix
+ outeryear: (period) -> period.replace(SUBDAY_REGEXP, '$3') # keeps the yyyy prefix
+
day:
Object.merge new Granularity('day'),
outermonth: (period) -> period.replace(DAY_REGEXP, '$1')
@@ -0,0 +1,35 @@
+Hierarchy = require '../hierarchy'
+rawentries = require '../rawentries'
+
+{parseLogLine} = require '../s3'
+
+# Converts the S3 log files of one period into raw ping entries.
+#
+# period - object whose `string` property is stamped onto every accepted entry as `date`
+# files  - list of objects whose `stats` property is iterated as log lines
+#          NOTE(review): confirm `stats` is the intended property for the parsed lines
+#
+# Returns the array of entries for which parseLogLine reported 'ok'.
+# Any exception thrown by parseLogLine is printed and ends the process with exit code 1.
+module.exports = (period, files) ->
+  accepted = []
+
+  # One counter per status string returned by parseLogLine.
+  counts = { ok: 0, skipped_url: 0, skipped_method: 0, skipped_code: 0, empty: 0, invalid: 0, malformed: 0 }
+
+  for file in files
+    for line in file.stats
+      try
+        [status, entry] = parseLogLine(line)
+      catch err
+        console.error "Error while processing:"
+        console.error line
+        console.error err.stack || err.message || err
+        process.exit 1
+
+      switch status
+        when 'ok'
+          # The period string replaces whatever date the parser derived from the line.
+          entry.date = period.string
+          accepted.push entry
+        when 'invalid'
+          console.log "Invalid:\n#{line}\n"
+      counts[status]++
+
+  console.log " - ok #{counts.ok}, skipped url:#{counts.skipped_url} method:#{counts.skipped_method} code:#{counts.skipped_code}, invalid #{counts.invalid}, malformed #{counts.malformed}, empty #{counts.empty}"
+
+  return accepted
@@ -70,6 +70,7 @@ exports.computeEvents = (entry) ->
if (entry.platform is 'windows') and (entry.iversion.startsWith '0.')
events.push "v:platform:windows"
events.push "v:os:win_any"
+ events.push "v:os:" + guessOperatingSystem(entry.agent)
else if entry.platform is 'windows'
return Hierarchy() # ignore; this is a dummy ping sent by the Mac backend, the native side sends better ones
else if entry.agent isnt '-'
View
@@ -0,0 +1,174 @@
+{sprintf} = require './sprintf'
+
+MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] # month abbreviations in calendar order; parseLogLine uses index + 1 as the month number
+
+# "[^"\\]*(?:\\.[^"\\]*)*" is a regexp to match a double-quoted string with escapes
+
+# Source text of a pattern for one log field: either a bare token or a
+# double-quoted string in which a backslash escapes the following character.
+VALUE_RE = ///
+  (?:
+    [^"\s] [^\s]*          # bare token: starts with a non-quote, runs up to the next whitespace
+    |
+    "                      # quoted string: opening quote,
+    [^"\\]*                #   a run of plain characters,
+    (?: \\ . [^"\\]* )*    #   any number of (escape pair, then plain characters),
+    "                      #   closing quote
+  )
+///.source
+
+# Source text of a pattern for the bracketed timestamp field,
+# e.g. [04/Aug/2006:22:34:02 +0000]. Contributes six capture groups.
+TIME_RE = ///
+  \[
+  ( \d+? ) / ( \w+? ) / ( \d+? )    # groups 1-3: day (04), month name (Aug), year (2006)
+  :
+  ( \d+? ) : ( \d+? ) : ( \d+? )    # groups 4-6: hour (22), minute (34), second (02)
+  \s [+-] \d+?                      # time zone offset, matched but not captured
+  \]
+///.source
+
+# Source text of a pattern for the remote-address field.
+IP_RE = ///
+  (?:
+    unknown                     # the literal word "unknown"
+    |
+    \d+ \. \d+ \. \d+ \. \d+    # IPv4: four decimal numbers separated by periods
+    |
+    [0-9a-f]* : [0-9a-f:]*      # IPv6: lowercase hex digits and colons, with at least one colon
+  )
+///.source
+
+REGEXP = ///^ # matches one S3 access-log line; the numbered comments below give the capture-group index of each field
+ (?: [0-9a-f]+ | - ) # bucket owner (hex) - 314159b66967d86f0...
+ \s
+ ( #{VALUE_RE} ) # 1: bucket - mybucket
+ \s
+ (?: #{TIME_RE} | - ) # 2,3,4,5,6,7: time - [04/Aug/2006:22:34:02 +0000]
+ \s
+ ( #{IP_RE} | - ) # 8: ip - 72.21.206.5
+ \s
+ #{VALUE_RE} # requester (ignored) - 314159b66967d86f0...
+ \s
+ #{VALUE_RE} # request id (ignored) - 3E57427F33A59F07
+ \s
+ ( #{VALUE_RE} ) # 9: operation - REST.GET.OBJECT, REST.PUT.OBJECT
+ \s
+ ( #{VALUE_RE} ) # 10: key - /photos/2006/08/puppy.jpg
+ \s
+ ( #{VALUE_RE} ) # 11: Request-URI - "GET /mybucket/photos/2006/08/puppy.jpg?x-foo=bar"
+ \s
+ ( - | \d+ ) # 12: HTTP status - 200
+ \s
+ #{VALUE_RE} # error code (ignored) - NoSuchBucket
+ \s
+ (?: - | \d+ ) # bytes sent (ignored) - 2662992
+ \s
+ (?: - | \d+ ) # object size (ignored) - 3462992
+ \s
+ (?: - | \d+ ) # total time, ms (ignored) - 70
+ \s
+ (?: - | \d+ ) # turn-around time, ms (ignored) - 10
+ \s
+ #{VALUE_RE} # HTTP referrer - "http://www.amazon.com/webservices"
+ \s
+ ( #{VALUE_RE} ) # 13: user agent - "curl/7.15.1"
+ \s
+ #{VALUE_RE} # version id (ignored) - 3HL4kqtJvjVBH40Nrjfkd
+
+ (?: $ | \s ) # the spec allows for more fields to be added later, so either end of string or a space
+///
+
+console.log "regexp = " + REGEXP # NOTE(review): runs whenever this module is loaded; looks like leftover debug output - confirm before removing
+
+# Turns a query string such as "a=1&b=2" into a params object.
+#  - "v" is stored as `version` and "iv" as `iversion`
+#  - keys starting with "stat." are collected under `stats`, with every dot replaced by an underscore
+#  - any other key is stored under its own name
+#  - pairs without "=" are ignored
+# Keys and values are passed through decodeURIComponent.
+parseQueryString = (qs) ->
+  result = {}
+  for pair in qs.split('&')
+    eq = pair.indexOf('=')
+    continue if eq < 0
+
+    name  = decodeURIComponent pair.substr(0, eq)
+    value = decodeURIComponent pair.substr(eq + 1)
+
+    switch
+      when name is 'v'
+        result.version = value
+      when name is 'iv'
+        result.iversion = value
+      when name.startsWith 'stat.'
+        result.stats ||= {}
+        result.stats[name.replace(/\./g, '_')] = value
+      else
+        result[name] = value
+  return result
+
+# Strips the surrounding double quotes from a quoted log field and drops the
+# backslash of each backslash escape; input that is not quoted is returned unchanged.
+unquote = (s) ->
+  quoted = s.charAt(0) is '"' and s.charAt(s.length - 1) is '"'
+  return s unless quoted
+  s.slice(1, -1).replace(/\\(.)/g, '$1')
+
+# Parses one line of an S3 access log into a ping entry.
+#
+# Returns `[status]` or `[status, entry]`, where status is one of:
+#   'empty'          - the line is blank
+#   'invalid'        - the line does not match REGEXP
+#   'skipped_method' - the operation is not REST.GET.OBJECT
+#   'skipped_code'   - the HTTP status is not 200
+#   'malformed'      - the Request-URI, query string or timestamp cannot be interpreted
+#   'skipped_url'    - the request is not for /news.json
+#   'ok'             - entry holds the query parameters plus date, time, ip and agent
+#
+# Throws an Error when the month name is not listed in MONTHS.
+exports.parseLogLine = (line) ->
+  line = line.trim()
+  return ['empty'] if line.length == 0
+
+  match = line.match REGEXP
+  return ['invalid'] unless match
+
+  # bucket and key are captured by REGEXP but not needed here.
+  [dummy, bucket, day, monthName, year, hour, min, sec, ip, operation, key, url, code, ua] = match
+
+  ip        = unquote(ip)
+  operation = unquote(operation)
+  url       = unquote(url)
+  ua        = unquote(ua)
+
+  return ['skipped_method'] if operation isnt 'REST.GET.OBJECT'
+  return ['skipped_code'] if code isnt '200'
+
+  # The Request-URI field has the form "GET <url> HTTP/x.y"; strip both ends.
+  unless url.match /^GET\s+/
+    console.log "bad Request-URI: #{url}"
+    return ['malformed']
+  unless url.match /\s+HTTP\/\d\.\d$/
+    console.log "bad Request-URI end: #{url}"
+    return ['malformed']
+  url = url.replace(/^GET\s+/, '').replace(/\s+HTTP\/\d\.\d$/, '')
+
+  # NOTE(review): the URL is decoded here as a whole and every key/value is decoded
+  # again by parseQueryString, so a percent-encoded '&', '=' or '%' inside a value is
+  # mis-split or double-decoded. Confirm how S3 encodes the Request-URI in its logs
+  # before changing the decoding order.
+  try
+    url = decodeURIComponent(url)
+  catch e
+    return ['malformed']
+
+  if !url.startsWith('/news.json?')
+    console.log "bad url: #{url}"
+    return ['skipped_url']
+
+  # A '%' left over after the decode above makes the second decode inside
+  # parseQueryString raise URIError; report the line as malformed instead of
+  # letting one bad request abort the caller.
+  try
+    params = parseQueryString url.replace('/news.json?', '')
+  catch e
+    throw e unless e instanceof URIError
+    return ['malformed']
+
+  # REGEXP accepts "-" in place of the timestamp, which leaves all six time groups undefined.
+  return ['malformed'] unless monthName?
+
+  month = MONTHS.indexOf(monthName)
+  if month < 0
+    throw new Error("Unknown month: '#{monthName}'")
+  month += 1
+
+  [y, d, hh, mm, ss] = (parseInt(part, 10) for part in [year, day, hour, min, sec])
+
+  # The time zone offset is not captured by TIME_RE, so the timestamp is read as UTC.
+  date = sprintf("%04d-%02d-%02d", y, month, d)
+  time = Math.round(Date.UTC(y, month - 1, d, hh, mm, ss) / 1000)
+
+  ua = '' if ua is '-'
+
+  # Assigned last so that query parameters with the same names cannot override them.
+  params.date = date
+  params.time = time
+  params.ip = ip
+  params.agent = ua
+
+  return ['ok', params]
View
@@ -7,6 +7,7 @@
"jade": "~0.27.7",
"sugar": "~1.3.7",
"dreamopt": "~0.8.0",
- "pad": "0.0.4"
+ "pad": "0.0.4",
+ "s3-log-parser": "0.0.2"
}
}
View
@@ -0,0 +1,4 @@
+#!/bin/bash
+# Upload the generated HTML reports in data/html to the public stats directory.
+src=data/html/
+dest=andreyvit_livereload@ssh.phx.nearlyfreespeech.net:/home/public/stats/win/
+url=http://livereload.com/stats/win/
+echo "RSYNC data/html to $url"
+rsync -vrz "$src" "$dest"
+echo "$url"
View
@@ -1,13 +1,20 @@
#! /bin/bash
yearly=false
+source=apache
if test "$1" = "-y"; then
yearly=true
echo "Will run yearly statistics."
shift
fi
+if test "$1" = "--s3"; then # checked after -y has been shifted off, so -y must come before --s3
+ source=s3 # used as the <source>-to-raw step name and as its first argument in the node command below
+ shift
+fi
+
+echo "Source: $source."
-node bin/process.js apache-to-raw apache raw "$@"
+node bin/process.js ${source}-to-raw $source raw "$@"
node bin/process.js rawtodaily raw day-events "$@"
node bin/process.js reduce day-events month-events "$@"

0 comments on commit 632571b

Please sign in to comment.