Skip to content

Commit

Permalink
Robot imitators
Browse files Browse the repository at this point in the history
  • Loading branch information
nielsbasjes committed Mar 11, 2019
1 parent 0c5ac1d commit 0fb59d6
Show file tree
Hide file tree
Showing 17 changed files with 886 additions and 304 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Expand Up @@ -21,6 +21,8 @@ v5.9-SNAPSHOT
- Apache Hive 3.1.1
- Massive improvement in detection of URLs.
- New more realistic performance benchmark test
- Renamed DeviceClass "Spy" to "Robot Imitator"
- More consistently add the DeviceBrand to the DeviceName

v5.8
===
Expand Down
Expand Up @@ -136,7 +136,7 @@ fragment TLD : 'aaa' | 'aarp' | 'abb' | 'abc' | 'able' | 'ac' | 'aco' | 'ad' |

fragment OtherTLDLikeEnds : 'htm' | 'html' | 'php';

fragment UrlHostname: [a-zA-Z\-_] [a-zA-Z0-9\-_]+ ('.'[a-zA-Z0-9\-_]+)* '.' ( TLD | OtherTLDLikeEnds ) ;
fragment UrlHostname: 'localhost' | ( [a-zA-Z\-_] [a-zA-Z0-9\-_]+ ('.'[a-zA-Z0-9\-_]+)* '.' ( TLD | OtherTLDLikeEnds )) ;
fragment UrlPathA : ('/'|'?') [a-zA-Z] [a-zA-Z0-9\-_~=?&%+.:/#]* ;
fragment UrlPathN : ('/'|'?') [0-9][a-zA-Z0-9\-_]* '/' [a-zA-Z0-9\-_~=?&%+.:/#]* ;
fragment BasicURL : ((('http'|'ftp') 's'? ':')? '//' )? UrlHostname (':'[0-9]+)? (UrlPathA|UrlPathN)? ;
Expand Down
Expand Up @@ -940,6 +940,17 @@ private UserAgent hardCodedPostProcessing(UserAgent userAgent) {
email.getConfidence());
}

if (deviceBrand.getConfidence() < 0) {
// If no brand is known then try to extract something that looks like a Brand from things like URL and Email addresses.
String newDeviceBrand = determineDeviceBrand(userAgent);
if (newDeviceBrand != null) {
userAgent.setForced(
DEVICE_BRAND,
newDeviceBrand,
1);
}
}

// Make sure the DeviceName always starts with the DeviceBrand
UserAgent.AgentField deviceName = userAgent.get(DEVICE_NAME);
if (deviceName.getConfidence() >= 0) {
Expand All @@ -961,16 +972,6 @@ private UserAgent hardCodedPostProcessing(UserAgent userAgent) {
deviceName.getConfidence());
}

if (deviceBrand.getConfidence() < 0) {
// If no brand is known then try to extract something that looks like a Brand from things like URL and Email addresses.
String newDeviceBrand = determineDeviceBrand(userAgent);
if (newDeviceBrand != null) {
userAgent.setForced(
DEVICE_BRAND,
newDeviceBrand,
1);
}
}

return userAgent;
}
Expand Down
Expand Up @@ -77,7 +77,7 @@ public enum DeviceClass {
/**
* Robots that visit the site pretending to be a well-known robot (such as Googlebot), while in reality they are not.
*/
Spy("Spy"),
RobotImitator("Robot Imitator"),
/**
* In case scripting is detected in the useragent string, also fallback in really broken situations.
*/
Expand Down
Expand Up @@ -30,7 +30,7 @@
import static nl.basjes.parse.useragent.classify.DeviceClass.Robot;
import static nl.basjes.parse.useragent.classify.DeviceClass.RobotMobile;
import static nl.basjes.parse.useragent.classify.DeviceClass.SetTopBox;
import static nl.basjes.parse.useragent.classify.DeviceClass.Spy;
import static nl.basjes.parse.useragent.classify.DeviceClass.RobotImitator;
import static nl.basjes.parse.useragent.classify.DeviceClass.TV;
import static nl.basjes.parse.useragent.classify.DeviceClass.Tablet;
import static nl.basjes.parse.useragent.classify.DeviceClass.Unclassified;
Expand Down Expand Up @@ -58,7 +58,7 @@ public static DeviceClass getDeviceClass(UserAgent userAgent) {
case "Handheld Game Console": return HandheldGameConsole;
case "Robot": return Robot;
case "Robot Mobile": return RobotMobile;
case "Spy": return Spy;
case "Robot Imitator": return RobotImitator;
case "Hacker": return Hacker;
case "Unknown": return Unknown;
default: return Unclassified;
Expand Down Expand Up @@ -87,7 +87,7 @@ public static boolean isNormalConsumerDevice(UserAgent userAgent) {
case Anonymized:
case Robot:
case RobotMobile:
case Spy:
case RobotImitator:
case Hacker:
case Unknown:
case Unclassified:
Expand Down Expand Up @@ -118,7 +118,7 @@ public static boolean isMobile(UserAgent userAgent) {
case GameConsole:
case Anonymized:
case Robot:
case Spy:
case RobotImitator:
case Hacker:
case Unknown:
case Unclassified:
Expand Down Expand Up @@ -149,7 +149,7 @@ public static boolean isHuman(UserAgent userAgent) {

case Robot:
case RobotMobile:
case Spy:
case RobotImitator:
case Hacker:
case Unknown:
case Unclassified:
Expand All @@ -165,7 +165,7 @@ public static boolean isHuman(UserAgent userAgent) {
public static boolean isDeliberateMisuse(UserAgent userAgent) {
switch (getDeviceClass(userAgent)) {
case Anonymized:
case Spy:
case RobotImitator:
case Hacker:
return true;

Expand Down
Expand Up @@ -95,7 +95,7 @@ public static String brand(String brand) {
}
}
}
return sb.toString();
return sb.toString().trim();
}

public static String cleanupDeviceBrandName(String deviceBrand, String deviceName) {
Expand Down
60 changes: 30 additions & 30 deletions analyzer/src/main/resources/UserAgents/BrokenUseragents.yaml
Expand Up @@ -480,7 +480,7 @@ config:
user_agent_string: 'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; Noldus B.V. {www.Noldus.com}; rv:11.0) like Gecko'
expected:
DeviceClass : 'Desktop'
DeviceName : 'Desktop'
DeviceName : 'Noldus Desktop'
DeviceBrand : 'Noldus'
DeviceCpuBits : '32'
OperatingSystemClass : 'Desktop'
Expand Down Expand Up @@ -534,7 +534,7 @@ config:
user_agent_string: '/5.0 (Windows; U; Windows NT 5.1; en-ZW) AppleWebKit/534.34 (KHTML, like Gecko) QtWeb Internet Browser/3.8.5 http://www.QtWeb.net'
expected:
DeviceClass : 'Desktop'
DeviceName : 'Desktop'
DeviceName : 'Qtweb Desktop'
DeviceBrand : 'Qtweb'
DeviceCpuBits : '32'
OperatingSystemClass : 'Desktop'
Expand Down Expand Up @@ -766,7 +766,7 @@ config:
user_agent_string: 'WordPress/3.5; http://smartphone.ro'
expected:
DeviceClass : 'Robot'
DeviceName : 'Robot'
DeviceName : 'Smartphone Robot'
DeviceBrand : 'Smartphone'
OperatingSystemClass : 'Cloud'
OperatingSystemName : 'Cloud'
Expand Down Expand Up @@ -800,7 +800,7 @@ config:
user_agent_string: 'WordPress.com; http://momspiration.nl'
expected:
DeviceClass : 'Robot'
DeviceName : 'Robot'
DeviceName : 'Momspiration Robot'
DeviceBrand : 'Momspiration'
OperatingSystemClass : 'Cloud'
OperatingSystemName : 'Cloud'
Expand Down Expand Up @@ -880,7 +880,7 @@ config:
user_agent_string: 'Mozilla/5.0 (compatible; Abonti/0.91 - http://www.abonti.com)'
expected:
DeviceClass : 'Robot'
DeviceName : 'Robot'
DeviceName : 'Abonti Robot'
DeviceBrand : 'Abonti'
OperatingSystemClass : 'Cloud'
OperatingSystemName : 'Cloud'
Expand Down Expand Up @@ -1623,7 +1623,7 @@ config:
user_agent_string: '"      ltx71 - (http://ltx71.com/)"'
expected:
DeviceClass : 'Robot'
DeviceName : 'Robot'
DeviceName : 'LTX71 Robot'
DeviceBrand : 'LTX71'
OperatingSystemClass : 'Cloud'
OperatingSystemName : 'Cloud'
Expand Down Expand Up @@ -2075,30 +2075,30 @@ config:
input:
user_agent_string: 'Mozilla/5.0 (Linux; U; Android Zoe ROM 7.7 - CM7.2 - SDSL v4; pl-pl; LG-GT540 Build/GWK74; -=Zoe ROM=-) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'
expected:
DeviceClass : 'Mobile'
DeviceName : 'LG GT540'
DeviceBrand : 'LG'
OperatingSystemClass : 'Mobile'
OperatingSystemName : 'Android'
OperatingSystemVersion : '7.7'
OperatingSystemNameVersion : 'Android 7.7'
OperatingSystemVersionBuild : 'GWK74'
LayoutEngineClass : 'Browser'
LayoutEngineName : 'Mozilla'
LayoutEngineVersion : '5.0'
LayoutEngineVersionMajor : '5'
LayoutEngineNameVersion : 'Mozilla 5.0'
LayoutEngineNameVersionMajor : 'Mozilla 5'
AgentClass : 'Browser'
AgentName : 'Android Zoe ROM' # 10 | FAILED; Should be 'CM7.2'
AgentVersion : '7.7' # 10 | FAILED; Should be 'SDSL'
AgentVersionMajor : '7' # 10 | FAILED; Should be 'SDSL'
AgentNameVersion : 'Android Zoe ROM 7.7' # 10 | FAILED; Should be 'CM7.2 SDSL'
AgentNameVersionMajor : 'Android Zoe ROM 7' # 10 | FAILED; Should be 'CM7.2 SDSL'
AgentLanguage : 'Polish (Poland)'
AgentLanguageCode : 'pl-pl'
AgentSecurity : 'Strong security'
__SyntaxError__ : 'true'
DeviceClass : 'Mobile'
DeviceName : 'LG GT540'
DeviceBrand : 'LG'
OperatingSystemClass : 'Mobile'
OperatingSystemName : 'Android'
OperatingSystemVersion : '7.7'
OperatingSystemNameVersion : 'Android 7.7'
OperatingSystemVersionBuild : 'GWK74'
LayoutEngineClass : 'Browser'
LayoutEngineName : 'Mozilla'
LayoutEngineVersion : '5.0'
LayoutEngineVersionMajor : '5'
LayoutEngineNameVersion : 'Mozilla 5.0'
LayoutEngineNameVersionMajor : 'Mozilla 5'
AgentClass : 'Browser'
AgentName : 'Android Zoe ROM' # 10 | FAILED; Should be 'CM7.2'
AgentVersion : '7.7' # 10 | FAILED; Should be 'SDSL'
AgentVersionMajor : '7' # 10 | FAILED; Should be 'SDSL'
AgentNameVersion : 'Android Zoe ROM 7.7' # 10 | FAILED; Should be 'CM7.2 SDSL'
AgentNameVersionMajor : 'Android Zoe ROM 7' # 10 | FAILED; Should be 'CM7.2 SDSL'
AgentLanguage : 'Polish (Poland)'
AgentLanguageCode : 'pl-pl'
AgentSecurity : 'Strong security'
__SyntaxError__ : 'true'

# This vendor sends out broken useragents by default: missing ')' just after the "Build/VM".
- test:
Expand Down
6 changes: 3 additions & 3 deletions analyzer/src/main/resources/UserAgents/EMailClients.yaml
Expand Up @@ -961,9 +961,9 @@ config:
variable:
- 'Dell: agent.product.comments.entry.product.name[1]="Dell"@'
extract:
- 'DeviceClass : 1000 : "Desktop"'
- 'DeviceName : 1000 : @Dell^'
- 'DeviceBrand : 1000 : @Dell'
- 'DeviceClass : 1000 : "Desktop"'
- 'DeviceName : 1000 : @Dell^'
- 'DeviceBrand : 1000 : @Dell'


- test:
Expand Down
80 changes: 0 additions & 80 deletions analyzer/src/main/resources/UserAgents/Hackers.yaml
Expand Up @@ -721,86 +721,6 @@ config:
HackerAttackVector : 'Unknown'
HackerToolkit : 'Unknown'

# Hackers imitating Googlebot --> wrong URL
- matcher:
require:
- 'agent.(1)product.(1)comments.entry.url="http://www.googlebot.com/bot.html"'
extract:
- 'DeviceClass : 10001 :"Spy"'
- 'DeviceName : 10001 :"Hacker Googlebot Immitator"'
- 'DeviceBrand : 10001 :"Hacker"'
- 'DeviceVersion : 10001 :"Hacker"'
- 'OperatingSystemClass : 10001 :"Hacker"'
- 'OperatingSystemName : 10001 :"Hacker"'
- 'OperatingSystemVersion : 10001 :"Hacker"'
- 'OperatingSystemNameVersion : 10001 :"Hacker"'
- 'LayoutEngineClass : 10001 :"Hacker"'
- 'LayoutEngineName : 10001 :"Hacker"'
- 'LayoutEngineVersion : 10001 :"Hacker"'
- 'LayoutEngineVersionMajor : 10001 :"Hacker"'
- 'AgentClass : 10001 :"Hacker"'
- 'AgentName : 10001 :"Hacker"'
- 'AgentVersion : 10001 :"Hacker"'
- 'AgentVersionMajor : 10001 :"Hacker"'
- 'HackerAttackVector : 10001 :"Spy scraper"'
- 'HackerToolkit : 10001 :"Googlebot immitator"'
- 'AgentNameVersion : 10001 :"Hacker"'
- 'AgentNameVersionMajor : 10001 :"Hacker"'
- 'AgentInformationUrl : 10001 :"<<<null>>>"'

- test:
input:
user_agent_string: 'Mozilla/5.0 (compatible; Googlebot/2.1 +http://www.googlebot.com/bot.html)'
expected:
DeviceClass : 'Spy'
DeviceName : 'Hacker Googlebot Immitator'
DeviceBrand : 'Hacker'
DeviceVersion : 'Hacker'
OperatingSystemClass : 'Hacker'
OperatingSystemName : 'Hacker'
OperatingSystemVersion : 'Hacker'
OperatingSystemNameVersion : 'Hacker'
LayoutEngineClass : 'Hacker'
LayoutEngineName : 'Hacker'
LayoutEngineVersion : 'Hacker'
LayoutEngineVersionMajor : 'Hacker'
LayoutEngineNameVersion : 'Hacker'
LayoutEngineNameVersionMajor : 'Hacker'
AgentClass : 'Hacker'
AgentName : 'Hacker'
AgentVersion : 'Hacker'
AgentVersionMajor : 'Hacker'
AgentNameVersion : 'Hacker'
AgentNameVersionMajor : 'Hacker'
HackerAttackVector : 'Spy scraper'
HackerToolkit : 'Googlebot immitator'

- test:
input:
user_agent_string: 'Googlebot/2.1 (+http://www.googlebot.com/bot.html)'
expected:
DeviceClass : 'Spy'
DeviceName : 'Hacker Googlebot Immitator'
DeviceBrand : 'Hacker'
DeviceVersion : 'Hacker'
OperatingSystemClass : 'Hacker'
OperatingSystemName : 'Hacker'
OperatingSystemVersion : 'Hacker'
OperatingSystemNameVersion : 'Hacker'
LayoutEngineClass : 'Hacker'
LayoutEngineName : 'Hacker'
LayoutEngineVersion : 'Hacker'
LayoutEngineVersionMajor : 'Hacker'
LayoutEngineNameVersion : 'Hacker'
LayoutEngineNameVersionMajor : 'Hacker'
AgentClass : 'Hacker'
AgentName : 'Hacker'
AgentVersion : 'Hacker'
AgentVersionMajor : 'Hacker'
AgentNameVersion : 'Hacker'
AgentNameVersionMajor : 'Hacker'
HackerAttackVector : 'Spy scraper'
HackerToolkit : 'Googlebot immitator'

- matcher:
# options:
Expand Down

0 comments on commit 0fb59d6

Please sign in to comment.