diff --git a/CHANGELOG.md b/CHANGELOG.md index fe5c97b..26e84e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## 3.3.0 (pending) + - Fix: capture os major version + update UA regexes [#69](https://github.com/logstash-plugins/logstash-filter-useragent/pull/69) + + The UA parser *regexes.yaml* update (to **v0.12.0**) will accurately detect recent user agent strings. + + NOTE: The update might cause changes in matching user agent fields such as `name` + (for example, the previous version did not support `Edge` and detected it as `Chrome`). + If needed, the old behavior can be restored by downloading the outdated [regexes.yaml](https://raw.githubusercontent.com/ua-parser/uap-core/2e6c983e42e7aae7d957a263cb4d3de7ccbd92af/regexes.yaml) + and configuring `regexes => path/to/regexes.yaml`. + + - Plugin no longer sets the `[build]` UA version field which is not implemented and was always `""`. + - Fix: `target => [field]` configuration, which wasn't working previously + ## 3.2.4 - Added support for OS regular expressions that use backreferences [#59](https://github.com/logstash-plugins/logstash-filter-useragent/pull/59) diff --git a/build.gradle b/build.gradle index fd0320a..d72209d 100644 --- a/build.gradle +++ b/build.gradle @@ -51,10 +51,23 @@ repositories { mavenCentral() } -String yamlResourceRoot = 'https://raw.githubusercontent.com/ua-parser/uap-core/2e6c983e42e7aae7d957a263cb4d3de7ccbd92af' +String yamlResourceRoot = 'https://raw.githubusercontent.com/ua-parser/uap-core/v0.12.0' +def yamlTempDir = File.createTempDir() +def yamlTempFile = yamlTempDir.toPath().resolve('regexes.yaml') task downloadYaml(type: Download, overwrite: false) { src yamlResourceRoot + '/regexes.yaml' - dest buildDir.toPath().resolve('resources/main/regexes.yaml').toFile() + dest yamlTempFile.toFile() +} + +task patchYaml(type: Copy, dependsOn: [downloadYaml]) { + from yamlTempFile + into buildDir.toPath().resolve('resources/main') + filter { line -> + // work-around a 
'regression' with extracting Mac OS version, without the '?' + // the pattern would extract major: '18', minor: '2' from agent strings like: + // "MacOutlook/16.24.0.190414 (Intelx64 Mac OS X Version 10.14.4 (Build 18E226))" + line.replace("- regex: 'Mac OS X\\s.{1,50}\\s(\\d+).(\\d+).(\\d+)'", "- regex: 'Mac OS X\\s.{1,50}?\\s(\\d+).(\\d+).(\\d+)'") + } } task downloadTestYaml(type: Download, overwrite: false) { @@ -71,10 +84,10 @@ task downloadTestYaml(type: Download, overwrite: false) { dest buildDir.toPath().resolve('resources/test').toFile() } -task verifyYaml(type: Verify, dependsOn: [downloadYaml, downloadTestYaml]) { +task verifyYaml(type: Verify, dependsOn: [patchYaml, downloadTestYaml]) { src buildDir.toPath().resolve('resources/main/regexes.yaml').toFile() algorithm 'SHA1' - checksum '21d1f46ef68fc5b2dc7f20cc7b6bc5af63b5f55d' + checksum '5a8ea18a9c9153e83159b8662e3f6650fbca60a8' // after replacement } dependencies { diff --git a/lib/logstash/filters/useragent.rb b/lib/logstash/filters/useragent.rb index eefbb62..2a20eea 100644 --- a/lib/logstash/filters/useragent.rb +++ b/lib/logstash/filters/useragent.rb @@ -1,9 +1,7 @@ # encoding: utf-8 -require "java" require "logstash-filter-useragent_jars" require "logstash/filters/base" require "logstash/namespace" -require "tempfile" require "thread" # Parse user agent strings into structured data based on BrowserScope data @@ -55,29 +53,32 @@ class LogStash::Filters::UserAgent < LogStash::Filters::Base # number of cache misses and waste memory. config :lru_cache_size, :validate => :number, :default => 100_000 - def register + def initialize(*params) + super + + # make @target in the format [field name] if defined, i.e. surrounded by brackets + target = @target || '' + target = "[#{@target}]" if !target.empty? 
&& target !~ /^\[[^\[\]]+\]$/ + + # predefine prefixed field names + @prefixed_name = "#{target}[#{@prefix}name]" + @prefixed_os = "#{target}[#{@prefix}os]" + @prefixed_os_name = "#{target}[#{@prefix}os_name]" + @prefixed_os_major = "#{target}[#{@prefix}os_major]" + @prefixed_os_minor = "#{target}[#{@prefix}os_minor]" + @prefixed_device = "#{target}[#{@prefix}device]" + @prefixed_major = "#{target}[#{@prefix}major]" + @prefixed_minor = "#{target}[#{@prefix}minor]" + @prefixed_patch = "#{target}[#{@prefix}patch]" + end + def register if @regexes.nil? @parser = org.logstash.uaparser.CachingParser.new(lru_cache_size) else @logger.debug("Using user agent regexes", :regexes => @regexes) @parser = org.logstash.uaparser.CachingParser.new(@regexes, lru_cache_size) end - - # make @target in the format [field name] if defined, i.e. surrounded by brakets - normalized_target = (@target && @target !~ /^\[[^\[\]]+\]$/) ? "[#{@target}]" : "" - - # predefine prefixed field names - @prefixed_name = "#{normalized_target}[#{@prefix}name]" - @prefixed_os = "#{normalized_target}[#{@prefix}os]" - @prefixed_os_name = "#{normalized_target}[#{@prefix}os_name]" - @prefixed_os_major = "#{normalized_target}[#{@prefix}os_major]" - @prefixed_os_minor = "#{normalized_target}[#{@prefix}os_minor]" - @prefixed_device = "#{normalized_target}[#{@prefix}device]" - @prefixed_major = "#{normalized_target}[#{@prefix}major]" - @prefixed_minor = "#{normalized_target}[#{@prefix}minor]" - @prefixed_patch = "#{normalized_target}[#{@prefix}patch]" - @prefixed_build = "#{normalized_target}[#{@prefix}build]" end def filter(event) @@ -88,8 +89,10 @@ def filter(event) begin ua_data = lookup_useragent(useragent) - rescue StandardError => e - @logger.error("Uknown error while parsing user agent data", :exception => e, :field => @source, :event => event) + rescue => e + @logger.error("Unknown error while parsing user agent data", + :exception => e.class, :message => e.message, :backtrace => e.backtrace, + :field => 
@source, :event => event.to_hash) return end @@ -101,47 +104,45 @@ def filter(event) filter_matched(event) end - # should be private but need to stay public for specs - # TODO: (colin) the related specs should be refactored to not rely on private methods. - def lookup_useragent(useragent) - return unless useragent + private + def lookup_useragent(useragent) # the UserAgentParser::Parser class is not thread safe, indications are that it is probably # caused by the underlying JRuby regex code that is not thread safe. # see https://github.com/logstash-plugins/logstash-filter-useragent/issues/25 @parser.parse(useragent) end - private - def set_fields(event, ua_data) # UserAgentParser outputs as US-ASCII. - event.set(@prefixed_name, ua_data.userAgent.family.dup.force_encoding(Encoding::UTF_8)) - - #OSX, Android and maybe iOS parse correctly, ua-agent parsing for Windows does not provide this level of detail + event.set(@prefixed_name, duped_string(ua_data.userAgent.family)) + event.set(@prefixed_device, duped_string(ua_data.device)) if ua_data.device - # Calls in here use #dup because there's potential for later filters to modify these values - # and corrupt the cache. See uap source here for details https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser - if (os = ua_data.os) + os = ua_data.os + if os # The OS is a rich object - event.set(@prefixed_os, ua_data.os.family.dup.force_encoding(Encoding::UTF_8)) - event.set(@prefixed_os_name, os.family.dup.force_encoding(Encoding::UTF_8)) if os.family + event.set(@prefixed_os, duped_string(os.family)) + event.set(@prefixed_os_name, duped_string(os.family)) if os.family # These are all strings - if os.minor && os.major - event.set(@prefixed_os_major, os.major.dup.force_encoding(Encoding::UTF_8)) if os.major - event.set(@prefixed_os_minor, os.minor.dup.force_encoding(Encoding::UTF_8)) if os.minor - end + major, minor = os.major, os.minor + event.set(@prefixed_os_major, duped_string(major)) if major # e.g. 
'Vista' or '10' + event.set(@prefixed_os_minor, duped_string(minor)) if minor end - event.set(@prefixed_device, ua_data.device.to_s.dup.force_encoding(Encoding::UTF_8)) if ua_data.device - - if (ua_version = ua_data.userAgent) - event.set(@prefixed_major, ua_version.major.dup.force_encoding(Encoding::UTF_8)) if ua_version.major - event.set(@prefixed_minor, ua_version.minor.dup.force_encoding(Encoding::UTF_8)) if ua_version.minor - event.set(@prefixed_patch, ua_version.patch.dup.force_encoding(Encoding::UTF_8)) if ua_version.patch - event.set(@prefixed_build, ua_version.patchMinor.dup.force_encoding(Encoding::UTF_8)) if ua_version.patchMinor + ua_version = ua_data.userAgent + if ua_version + event.set(@prefixed_major, duped_string(ua_version.major)) if ua_version.major + event.set(@prefixed_minor, duped_string(ua_version.minor)) if ua_version.minor + event.set(@prefixed_patch, duped_string(ua_version.patch)) if ua_version.patch end end + + def duped_string(str) + # Calls in here use #dup because there's potential for later filters to modify these values + # and corrupt the cache. 
See uap source here for details https://github.com/ua-parser/uap-ruby/tree/master/lib/user_agent_parser + str.dup.force_encoding(Encoding::UTF_8) + end + end diff --git a/logstash-filter-useragent.gemspec b/logstash-filter-useragent.gemspec index 07ae5ea..63a0f27 100644 --- a/logstash-filter-useragent.gemspec +++ b/logstash-filter-useragent.gemspec @@ -23,6 +23,5 @@ Gem::Specification.new do |s| # Gem dependencies s.add_runtime_dependency "logstash-core-plugin-api", ">= 1.60", "<= 2.99" s.add_development_dependency 'logstash-devutils' - s.add_development_dependency 'insist' end diff --git a/spec/filters/useragent_spec.rb b/spec/filters/useragent_spec.rb index c5ad6f2..835e9c8 100644 --- a/spec/filters/useragent_spec.rb +++ b/spec/filters/useragent_spec.rb @@ -1,11 +1,13 @@ # encoding: utf-8 - require "logstash/devutils/rspec/spec_helper" -require "insist" require "logstash/filters/useragent" describe LogStash::Filters::UserAgent do + subject { LogStash::Filters::UserAgent.new(options) } + + let(:options) { { "source" => "foo" } } + describe "defaults" do config <<-CONFIG filter { @@ -17,22 +19,117 @@ CONFIG sample "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.63 Safari/537.31" do - insist { subject }.include?("ua") - insist { subject.get("[ua][name]") } == "Chrome" - insist { subject.get("[ua][os]") } == "Linux" - insist { subject.get("[ua][major]") } == "26" - insist { subject.get("[ua][minor]") } == "0" + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "Chrome" + expect( subject.get("[ua][os]") ).to eql "Linux" + expect( subject.get("[ua][major]") ).to eql "26" + expect( subject.get("[ua][minor]") ).to eql "0" + expect( subject.get("[ua][device]") ).to eql "Other" + + expect( subject.get("[ua][minor]").encoding ).to eql Encoding::UTF_8 end sample "MacOutlook/16.24.0.190414 (Intelx64 Mac OS X Version 10.14.4 (Build 18E226))" do - insist { subject }.include?("ua") - insist { 
subject.get("[ua][name]") } == "MacOutlook" - insist { subject.get("[ua][major]") } == "16" - insist { subject.get("[ua][minor]") } == "24" - insist { subject.get("[ua][os]") } == "Mac OS X" - insist { subject.get("[ua][os_name]") } == "Mac OS X" - insist { subject.get("[ua][os_major]") } == "10" - insist { subject.get("[ua][os_minor]") } == "14" + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "MacOutlook" + expect( subject.get("[ua][major]") ).to eql "16" + expect( subject.get("[ua][minor]") ).to eql "24" + expect( subject.get("[ua][patch]") ).to eql "0" + expect( subject.get("[ua][os]") ).to eql "Mac OS X" + expect( subject.get("[ua][os_name]") ).to eql "Mac OS X" + expect( subject.get("[ua][os_major]") ).to eql '10' + expect( subject.get("[ua][os_minor]") ).to eql '14' + expect( subject.get("[ua][device]") ).to eql 'Mac' + + expect( subject.get("[ua][os_major]").encoding ).to eql Encoding::UTF_8 + end + + # Safari 12 on Mojave + sample "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "Safari" + expect( subject.get("[ua][major]") ).to eql "12" + expect( subject.get("[ua][minor]") ).to eql "0" + expect( subject.get("[ua][patch]") ).to be nil + expect( subject.get("[ua][os]") ).to eql "Mac OS X" + expect( subject.get("[ua][os_major]") ).to eql '10' + expect( subject.get("[ua][os_minor]") ).to eql '14' + end + + # Safari 7 on Mac OS X (Mavericks) + sample "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "Safari" + expect( subject.get("[ua][major]") ).to eql "7" + expect( subject.get("[ua][minor]") ).to eql "0" + expect( subject.get("[ua][patch]") ).to eql "3" + expect( subject.get("[ua][os]") ).to eql 
"Mac OS X" + expect( subject.get("[ua][os_major]") ).to eql '10' + expect( subject.get("[ua][os_minor]") ).to eql '9' + expect( subject.get("[ua][device]") ).to eql 'Mac' + end + + sample "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:45.0) Gecko/20100101 Firefox/45.0" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "Firefox" + expect( subject.get("[ua][major]") ).to eql "45" + expect( subject.get("[ua][minor]") ).to eql "0" + expect( subject.get("[ua][patch]") ).to be nil + expect( subject.get("[ua][os]") ).to eql "Mac OS X" + expect( subject.get("[ua][os_major]") ).to eql '10' + expect( subject.get("[ua][os_minor]") ).to eql '11' + expect( subject.get("[ua][device]") ).to eql 'Mac' + end + + # IE7 Vista + sample "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][os]") ).to eql "Windows" + expect( subject.get("[ua][os_major]") ).to eql 'Vista' + expect( subject.get("[ua][os_minor]") ).to be nil + expect( subject.get("[ua][device]") ).to eql 'Other' + + expect( subject.get("[ua][device]").encoding ).to eql Encoding::UTF_8 + end + + # IE8 XP + sample "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.5.30729)" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][os]") ).to eql 'Windows' + expect( subject.get("[ua][os_major]") ).to eql 'XP' + expect( subject.get("[ua][os_minor]") ).to be nil + expect( subject.get("[ua][name]") ).to eql 'IE' + expect( subject.get("[ua][device]") ).to eql 'Other' + end + + # Windows 8.1 + sample "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][os]") ).to eql 'Windows' + expect( subject.get("[ua][os_major]") ).to eql '8' + expect( subject.get("[ua][os_minor]") ).to eql '1' + 
expect( subject.get("[ua][name]") ).to eql 'Edge' + expect( subject.get("[ua][device]") ).to eql 'Other' + end + + # Windows 10 + sample "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.50" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][os]") ).to eql "Windows" + expect( subject.get("[ua][os_major]") ).to eql '10' + expect( subject.get("[ua][os_minor]") ).to be nil + expect( subject.get("[ua][name]") ).to eql 'Edge' + expect( subject.get("[ua][device]") ).to eql 'Other' + end + + # Chrome on Linux + sample "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36" do + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][os]") ).to eql "Linux" + expect( subject.get("[ua][os_major]") ).to be nil + expect( subject.get("[ua][os_minor]") ).to be nil + expect( subject.get("[ua][name]") ).to eql 'Chrome' + expect( subject.get("[ua][device]") ).to eql 'Other' end end @@ -41,18 +138,18 @@ filter { useragent { source => "message" - target => "ua" + target => "[ua]" regexes => "build/resources/main/regexes.yaml" } } CONFIG sample "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.63 Safari/537.31" do - insist { subject }.include?("ua") - insist { subject.get("[ua][name]") } == "Chrome" - insist { subject.get("[ua][os]") } == "Linux" - insist { subject.get("[ua][major]") } == "26" - insist { subject.get("[ua][minor]") } == "0" + expect( subject.to_hash ).to include("ua") + expect( subject.get("[ua][name]") ).to eql "Chrome" + expect( subject.get("[ua][os]") ).to eql "Linux" + expect( subject.get("[ua][major]") ).to eql "26" + expect( subject.get("[ua][minor]") ).to eql "0" end end @@ -66,10 +163,30 @@ CONFIG sample "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.63 Safari/537.31" do - insist { subject.get("name") } == "Chrome" - 
insist { subject.get("os") } == "Linux" - insist { subject.get("major") } == "26" - insist { subject.get("minor") } == "0" + expect( subject.get("name") ).to eql "Chrome" + expect( subject.get("os") ).to eql "Linux" + expect( subject.get("major") ).to eql "26" + expect( subject.get("minor") ).to eql "0" + expect( subject.get("patch") ).to eql "1410" + end + end + + describe "nested target field" do + config <<-CONFIG + filter { + useragent { + source => "message" + target => "[foo][bar]" + } + } + CONFIG + + # Facebook App User Agent + sample "Mozilla/5.0 (iPhone; CPU iPhone OS 13_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) " + + "Mobile/15E148 [FBAN/FBIOS;FBDV/iPhone11,8;FBMD/iPhone;FBSN/iOS;FBSV/13.3.1;FBSS/2;FBID/phone;FBLC/en_US;FBOP/5;FBCR/]" do + expect( subject ).to include 'foo' + expect( subject.get('foo') ).to include 'bar' + expect( subject.get('foo')['bar'] ).to include "name" => "Facebook", "device" => "iPhone", "os" => "iOS" end end @@ -84,29 +201,28 @@ CONFIG sample "foo" => "bar" do - reject { subject }.include?("ua") + expect( subject.to_hash ).to_not include("ua") end sample "" do - reject { subject }.include?("ua") + expect( subject.to_hash ).to_not include("ua") end end + let(:ua_string) { "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36" } + let(:event) { LogStash::Event.new("foo" => ua_string) } + describe "LRU object identity" do - let(:ua_string) { "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36" } - let(:uafilter) { LogStash::Filters::UserAgent.new("source" => "foo") } - let(:ua_data) { uafilter.lookup_useragent(ua_string) } - subject(:target) { LogStash::Event.new("foo" => ua_string) } + let(:ua_data) { subject.send :lookup_useragent, ua_string } before do - uafilter.register + subject.register # Stub this out because this UA doesn't have this field allow(ua_data.userAgent).to 
receive(:patchMinor).and_return("foo") - # expect(event).receive(:lookup_useragent) - uafilter.filter(target) + subject.filter(event) end { @@ -119,11 +235,10 @@ "major" => lambda {|uad| uad.userAgent.major}, "minor" => lambda {|uad| uad.userAgent.minor}, "patch" => lambda {|uad| uad.userAgent.patch}, - "build" => lambda {|uad| uad.userAgent.patchMinor} }.each do |field, uad_getter| context "for the #{field} field" do - let(:value) {uad_getter.call(ua_data)} - let(:target_field) { target.get(field)} + let(:value) { uad_getter.call(ua_data) } + let(:target_field) { event.get(field) } it "should not have a nil value" do expect(target_field).to be_truthy @@ -136,6 +251,10 @@ it "should dup/clone the field to prevent cache corruption" do expect(target_field.object_id).not_to eql(value.object_id) end + + it "should be an utf-8 string" do + expect(target_field.encoding.name).to eql 'UTF-8' + end end end end @@ -151,11 +270,25 @@ CONFIG sample "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.63 Safari/537.31" do - insist { subject.to_hash }.include?("message") - insist { subject.get("[message][name]") } == "Chrome" - insist { subject.get("[message][os]") } == "Linux" - insist { subject.get("[message][major]") } == "26" - insist { subject.get("[message][minor]") } == "0" + expect( subject.to_hash ).to include("message") + expect( subject.get("[message][name]") ).to eql "Chrome" + expect( subject.get("[message][os]") ).to eql "Linux" + expect( subject.get("[message][major]") ).to eql "26" + expect( subject.get("[message][minor]") ).to eql "0" end end + + context 'exception handling' do + + before do + subject.register + expect(subject).to receive(:lookup_useragent).and_raise RuntimeError.new('this is a test') + end + + it 'errors do not propagate' do + expect(subject.logger).to receive(:error).with(/Unknown error while parsing user agent data/, hash_including(exception: RuntimeError, message: 'this is a test')) + expect { 
subject.filter(event) }.not_to raise_error + end + + end end diff --git a/src/main/java/org/logstash/uaparser/UserAgent.java b/src/main/java/org/logstash/uaparser/UserAgent.java index 005e369..40322e2 100644 --- a/src/main/java/org/logstash/uaparser/UserAgent.java +++ b/src/main/java/org/logstash/uaparser/UserAgent.java @@ -34,12 +34,6 @@ public final class UserAgent { public final String patch; - /** - * Placeholder: Currently unused by the Java code itself but part of the existing Ruby spec. - * @todo Refactor RSpec code to not need a mockable field on this class - */ - public final String patchMinor = ""; - public UserAgent(String family, String major, String minor, String patch) { this.family = family; this.major = major; diff --git a/version b/version index 351227f..15a2799 100644 --- a/version +++ b/version @@ -1 +1 @@ -3.2.4 +3.3.0