Skip to content

Commit

Permalink
Fix: capture os major + update UA regexes (#69)
Browse files Browse the repository at this point in the history
* Test: drop (confusing) insist expectations

* Refactor: dry-out string dup-ing

* Test: add some UTF-8 string asserts

* Refactor: and test behavior on raised errors

* Refactor: move field name setup to initialize

* Fix: remove unused (never filled) event field

* Fix: detect 'major' OS version (even if no minor)
This is useful for detecting Windows major versions such as 'Vista' or '10'.

* Fix: update to latest UA regexes for accurate matching

* Fix: [bracketed] target field configuration

* Fix: work-around Mac OS version matching regression
  • Loading branch information
kares committed Mar 24, 2021
1 parent 495e695 commit 3d7221e
Show file tree
Hide file tree
Showing 7 changed files with 252 additions and 99 deletions.
13 changes: 13 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## 3.3.0 (pending)
- Fix: capture os major version + update UA regexes [#69](https://github.com/logstash-plugins/logstash-filter-useragent/pull/69)

The UA parser *regexes.yaml* update (to **v0.12.0**) will accurately detect recent user agent strings.

NOTE: The update might cause changes in matching user agent fields such as `name`
(for example, the previous version did not support `Edge` and detected it as `Chrome`).
If needed the old behavior can be restored by downloading the outdated [regexes.yaml](https://raw.githubusercontent.com/ua-parser/uap-core/2e6c983e42e7aae7d957a263cb4d3de7ccbd92af/regexes.yaml)
and configuring `regexes => path/to/regexes.yaml`.

- Plugin no longer sets the `[build]` UA version field which is not implemented and was always `""`.
- Fix: `target => [field]` configuration, which wasn't working previously

## 3.2.4
- Added support for OS regular expressions that use backreferences [#59](https://github.com/logstash-plugins/logstash-filter-useragent/pull/59)

Expand Down
21 changes: 17 additions & 4 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,23 @@ repositories {
mavenCentral()
}

// Root URL of the ua-parser/uap-core revision that regexes.yaml is fetched from.
String yamlResourceRoot = 'https://raw.githubusercontent.com/ua-parser/uap-core/2e6c983e42e7aae7d957a263cb4d3de7ccbd92af'
String yamlResourceRoot = 'https://raw.githubusercontent.com/ua-parser/uap-core/v0.12.0'
// Temporary location used as a download destination for regexes.yaml.
def yamlTempDir = File.createTempDir()
def yamlTempFile = yamlTempDir.toPath().resolve('regexes.yaml')
// Downloads regexes.yaml from yamlResourceRoot.
task downloadYaml(type: Download, overwrite: false) {
src yamlResourceRoot + '/regexes.yaml'
dest buildDir.toPath().resolve('resources/main/regexes.yaml').toFile()
dest yamlTempFile.toFile()
}

// Copies the downloaded regexes.yaml into resources/main, rewriting one Mac OS X
// pattern line on the way.
task patchYaml(type: Copy, dependsOn: [downloadYaml]) {
    // Work-around for a 'regression' with extracting the Mac OS version: without the
    // '?' after '.{1,50}' the pattern would extract major: '18', minor: '2' from
    // agent strings like:
    // "MacOutlook/16.24.0.190414 (Intelx64 Mac OS X Version 10.14.4 (Build 18E226))"
    def upstreamPattern = "- regex: 'Mac OS X\\s.{1,50}\\s(\\d+).(\\d+).(\\d+)'"
    def patchedPattern = "- regex: 'Mac OS X\\s.{1,50}?\\s(\\d+).(\\d+).(\\d+)'"

    from yamlTempFile
    into buildDir.toPath().resolve('resources/main')
    filter { line -> line.replace(upstreamPattern, patchedPattern) }
}

task downloadTestYaml(type: Download, overwrite: false) {
Expand All @@ -71,10 +84,10 @@ task downloadTestYaml(type: Download, overwrite: false) {
dest buildDir.toPath().resolve('resources/test').toFile()
}

// Checks the SHA1 checksum of resources/main/regexes.yaml against the expected value.
task verifyYaml(type: Verify, dependsOn: [downloadYaml, downloadTestYaml]) {
task verifyYaml(type: Verify, dependsOn: [patchYaml, downloadTestYaml]) {
src buildDir.toPath().resolve('resources/main/regexes.yaml').toFile()
algorithm 'SHA1'
checksum '21d1f46ef68fc5b2dc7f20cc7b6bc5af63b5f55d'
checksum '5a8ea18a9c9153e83159b8662e3f6650fbca60a8' // after replacement
}

dependencies {
Expand Down
91 changes: 46 additions & 45 deletions lib/logstash/filters/useragent.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
# encoding: utf-8
require "java"
require "logstash-filter-useragent_jars"
require "logstash/filters/base"
require "logstash/namespace"
require "tempfile"
require "thread"

# Parse user agent strings into structured data based on BrowserScope data
Expand Down Expand Up @@ -55,29 +53,32 @@ class LogStash::Filters::UserAgent < LogStash::Filters::Base
# number of cache misses and waste memory.
config :lru_cache_size, :validate => :number, :default => 100_000

def register
# Precomputes the event field references this filter writes to.
#
# After the superclass constructor has run, @target (when set and non-empty) is
# wrapped in brackets unless it already is a single [bracketed] name, and every
# output field name is derived once from that target and @prefix.
def initialize(*params)
  super

  # normalize @target to the [field name] form; an unset target yields no prefix
  target = @target || ''
  unless target.empty? || target =~ /^\[[^\[\]]+\]$/
    target = "[#{@target}]"
  end

  # builds "<target>[<prefix><name>]" for a single output field
  field_ref = ->(name) { "#{target}[#{@prefix}#{name}]" }

  @prefixed_name     = field_ref.call('name')
  @prefixed_os       = field_ref.call('os')
  @prefixed_os_name  = field_ref.call('os_name')
  @prefixed_os_major = field_ref.call('os_major')
  @prefixed_os_minor = field_ref.call('os_minor')
  @prefixed_device   = field_ref.call('device')
  @prefixed_major    = field_ref.call('major')
  @prefixed_minor    = field_ref.call('minor')
  @prefixed_patch    = field_ref.call('patch')
end

# Creates the caching user agent parser (@parser): without a regexes argument when
# @regexes is nil, otherwise from the configured @regexes value. Both forms are
# given lru_cache_size.
def register
if @regexes.nil?
@parser = org.logstash.uaparser.CachingParser.new(lru_cache_size)
else
@logger.debug("Using user agent regexes", :regexes => @regexes)
@parser = org.logstash.uaparser.CachingParser.new(@regexes, lru_cache_size)
end

# make @target in the format [field name] if defined, i.e. surrounded by brackets
# NOTE(review): this expression yields "" when @target is unset AND when it is
# already in the [bracketed] form — confirm that dropping the target is intended
normalized_target = (@target && @target !~ /^\[[^\[\]]+\]$/) ? "[#{@target}]" : ""

# predefine prefixed field names
@prefixed_name = "#{normalized_target}[#{@prefix}name]"
@prefixed_os = "#{normalized_target}[#{@prefix}os]"
@prefixed_os_name = "#{normalized_target}[#{@prefix}os_name]"
@prefixed_os_major = "#{normalized_target}[#{@prefix}os_major]"
@prefixed_os_minor = "#{normalized_target}[#{@prefix}os_minor]"
@prefixed_device = "#{normalized_target}[#{@prefix}device]"
@prefixed_major = "#{normalized_target}[#{@prefix}major]"
@prefixed_minor = "#{normalized_target}[#{@prefix}minor]"
@prefixed_patch = "#{normalized_target}[#{@prefix}patch]"
@prefixed_build = "#{normalized_target}[#{@prefix}build]"
end

def filter(event)
Expand All @@ -88,8 +89,10 @@ def filter(event)

begin
ua_data = lookup_useragent(useragent)
rescue StandardError => e
@logger.error("Uknown error while parsing user agent data", :exception => e, :field => @source, :event => event)
rescue => e
@logger.error("Unknown error while parsing user agent data",
:exception => e.class, :message => e.message, :backtrace => e.backtrace,
:field => @source, :event => event.to_hash)
return
end

Expand All @@ -101,47 +104,45 @@ def filter(event)
filter_matched(event)
end

# should be private but need to stay public for specs
# TODO: (colin) the related specs should be refactored to not rely on private methods.
def lookup_useragent(useragent)
return unless useragent
private

# Parses the given user agent string with @parser and returns the parser's result.
def lookup_useragent(useragent)
# the UserAgentParser::Parser class is not thread safe, indications are that it is probably
# caused by the underlying JRuby regex code that is not thread safe.
# see https://github.com/logstash-plugins/logstash-filter-useragent/issues/25
# NOTE(review): @parser is built as org.logstash.uaparser.CachingParser in this file —
# confirm whether the thread-safety remark above still applies to it
@parser.parse(useragent)
end

private

# Copies the parsed user agent data onto the event: the UA family, the device, the
# OS family / major / minor and the UA major / minor / patch values, each written
# with event.set under the matching precomputed @prefixed_* field name.
# Assignments guarded by a trailing `if` are skipped when the value is nil.
def set_fields(event, ua_data)
# UserAgentParser outputs as US-ASCII.

event.set(@prefixed_name, ua_data.userAgent.family.dup.force_encoding(Encoding::UTF_8))

#OSX, Android and maybe iOS parse correctly, ua-agent parsing for Windows does not provide this level of detail
event.set(@prefixed_name, duped_string(ua_data.userAgent.family))
event.set(@prefixed_device, duped_string(ua_data.device)) if ua_data.device

# Calls in here use #dup because there's potential for later filters to modify these values
# and corrupt the cache. See uap source here for details https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser
if (os = ua_data.os)
os = ua_data.os
if os
# The OS is a rich object
event.set(@prefixed_os, ua_data.os.family.dup.force_encoding(Encoding::UTF_8))
event.set(@prefixed_os_name, os.family.dup.force_encoding(Encoding::UTF_8)) if os.family
event.set(@prefixed_os, duped_string(os.family))
event.set(@prefixed_os_name, duped_string(os.family)) if os.family

# These are all strings
if os.minor && os.major
event.set(@prefixed_os_major, os.major.dup.force_encoding(Encoding::UTF_8)) if os.major
event.set(@prefixed_os_minor, os.minor.dup.force_encoding(Encoding::UTF_8)) if os.minor
end
major, minor = os.major, os.minor
event.set(@prefixed_os_major, duped_string(major)) if major # e.g. 'Vista' or '10'
event.set(@prefixed_os_minor, duped_string(minor)) if minor
end

event.set(@prefixed_device, ua_data.device.to_s.dup.force_encoding(Encoding::UTF_8)) if ua_data.device

if (ua_version = ua_data.userAgent)
event.set(@prefixed_major, ua_version.major.dup.force_encoding(Encoding::UTF_8)) if ua_version.major
event.set(@prefixed_minor, ua_version.minor.dup.force_encoding(Encoding::UTF_8)) if ua_version.minor
event.set(@prefixed_patch, ua_version.patch.dup.force_encoding(Encoding::UTF_8)) if ua_version.patch
event.set(@prefixed_build, ua_version.patchMinor.dup.force_encoding(Encoding::UTF_8)) if ua_version.patchMinor
ua_version = ua_data.userAgent
if ua_version
event.set(@prefixed_major, duped_string(ua_version.major)) if ua_version.major
event.set(@prefixed_minor, duped_string(ua_version.minor)) if ua_version.minor
event.set(@prefixed_patch, duped_string(ua_version.patch)) if ua_version.patch
end
end

# Returns a copy of +str+ whose encoding is set to UTF-8; +str+ itself is left untouched.
#
# The copy (#dup) matters because later filters could modify these values in place
# and thereby corrupt the cache. See uap source here for details
# https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser
def duped_string(str)
  copy = str.dup
  copy.force_encoding(Encoding::UTF_8)
end

end
1 change: 0 additions & 1 deletion logstash-filter-useragent.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,5 @@ Gem::Specification.new do |s|
# Gem dependencies
s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99"
s.add_development_dependency 'logstash-devutils'
s.add_development_dependency 'insist'
end

0 comments on commit 3d7221e

Please sign in to comment.