From 9973efcb399de9e9530642c082f38043cbb0077a Mon Sep 17 00:00:00 2001 From: Mike Levin Date: Fri, 13 Mar 2026 07:31:52 -0400 Subject: [PATCH] When we mine for new bots it's now a bit more open-ended than using the known-bots constraints --- foo_files.py | 2 +- .../honeybot/queries/mine_bots_heuristic.sql | 73 ------------------- remotes/honeybot/scripts/build_bot_miner.py | 0 3 files changed, 1 insertion(+), 74 deletions(-) delete mode 100644 remotes/honeybot/queries/mine_bots_heuristic.sql mode change 100644 => 100755 remotes/honeybot/scripts/build_bot_miner.py diff --git a/foo_files.py b/foo_files.py index a80415d1..f1c5fbca 100644 --- a/foo_files.py +++ b/foo_files.py @@ -309,7 +309,7 @@ # ! echo "--- THE MARKDOWN DIET ---" && cat remotes/honeybot/queries/md_diet.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' # ! echo "--- TRAPDOOR IPS ---" && cat remotes/honeybot/queries/trapdoor_ips.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' # ! echo "--- TRAPDOOR EVENTS ---" && cat remotes/honeybot/queries/telemetry_trapdoor_events.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' -# ! echo "--- BOT MINER (Heuristic Scoring) ---" && cat remotes/honeybot/queries/mine_bots_heuristic.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' +# ! echo "--- BOT MINER (Heuristic Scoring) ---" && python remotes/honeybot/scripts/build_bot_miner.py | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' # ! echo "--- UNKNOWN AGENTS (Empty/Generic UAs) ---" && cat remotes/honeybot/queries/intel_unknown_agents.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' # ! echo "--- HOSTILE DICTIONARY (Probes) ---" && cat remotes/honeybot/queries/intel_hostile_dictionary.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' # ! echo "--- NOISE 404s (PHP/WP Probes) ---" && cat remotes/honeybot/queries/intel_noise_404s.sql | ssh honeybot 'sqlite3 -header -column ~/www/mikelev.in/honeybot.db' diff --git a/remotes/honeybot/queries/mine_bots_heuristic.sql b/remotes/honeybot/queries/mine_bots_heuristic.sql deleted file mode 100644 index 2bb72420..00000000 --- a/remotes/honeybot/queries/mine_bots_heuristic.sql +++ /dev/null @@ -1,73 +0,0 @@ -SELECT - ua.value as suspicious_agent, - SUM(logs.count) as total_hits, - ( - -- Heuristic 1: Suspicious Keywords (+10) - (CASE WHEN ua.value LIKE '%bot%' - OR ua.value LIKE '%crawl%' - OR ua.value LIKE '%spider%' - OR ua.value LIKE '%fetch%' - OR ua.value LIKE '%scrape%' - OR ua.value LIKE '%search%' - OR ua.value LIKE '%preview%' - OR ua.value LIKE '%monitor%' - OR ua.value LIKE '%http%' - OR ua.value LIKE '%python%' - OR ua.value LIKE '%curl%' - OR ua.value LIKE '%wget%' - OR ua.value LIKE '%headless%' - OR ua.value LIKE '%puppeteer%' - OR ua.value LIKE '%selenium%' - THEN 10 ELSE 0 END) + - - -- Heuristic 2: Non-Mozilla Format (+5) - (CASE WHEN ua.value NOT LIKE '%Mozilla%' THEN 5 ELSE 0 END) + - - -- Heuristic 3: High Volume (+2 if over 50 hits) - (CASE WHEN SUM(logs.count) > 50 THEN 2 ELSE 0 END) + - - -- Heuristic 4: "Compatible" but not a standard browser (+5) - (CASE WHEN ua.value LIKE '%compatible%' - AND ua.value LIKE '%Mozilla%' - AND ua.value NOT LIKE '%Chrome%' - AND ua.value NOT LIKE '%Safari%' - AND ua.value NOT LIKE '%Firefox%' - THEN 5 ELSE 0 END) - ) as bot_score - -FROM daily_logs logs -JOIN user_agents ua ON logs.ua_id = ua.id -WHERE - -- 1. Exclude the "Orange List" (KNOWN BOTS) - ua.value NOT LIKE '%AhrefsBot%' - AND ua.value NOT LIKE '%Amazonbot%' - AND ua.value NOT LIKE '%Applebot%' - AND ua.value NOT LIKE '%Baiduspider%' - AND ua.value NOT LIKE '%Bytespider%' - AND ua.value NOT LIKE '%ChatGPT-User%' - AND ua.value NOT LIKE '%ClaudeBot%' - AND ua.value NOT LIKE '%DataForSeoBot%' - AND ua.value NOT LIKE '%GPTBot%' - AND ua.value NOT LIKE '%Google-Safety%' - AND ua.value NOT LIKE '%Googlebot%' - AND ua.value NOT LIKE '%KagiApp%' - AND ua.value NOT LIKE '%MJ12bot%' - AND ua.value NOT LIKE '%OAI-SearchBot%' - AND ua.value NOT LIKE '%Perplexity%' - AND ua.value NOT LIKE '%PetalBot%' - AND ua.value NOT LIKE '%PromptingBot%' - AND ua.value NOT LIKE '%SemrushBot%' - AND ua.value NOT LIKE '%SeznamBot%' - AND ua.value NOT LIKE '%TikTokSpider%' - AND ua.value NOT LIKE '%Twitterbot%' - AND ua.value NOT LIKE '%Yandex%' - AND ua.value NOT LIKE '%YisouSpider%' - AND ua.value NOT LIKE '%axios%' - AND ua.value NOT LIKE '%bingbot%' - AND ua.value NOT LIKE '%meta-externalagent%' - -GROUP BY ua.id --- Only show things that triggered at least one heuristic rule -HAVING bot_score > 0 -ORDER BY bot_score DESC, total_hits DESC -LIMIT 50; diff --git a/remotes/honeybot/scripts/build_bot_miner.py b/remotes/honeybot/scripts/build_bot_miner.py old mode 100644 new mode 100755