From 079aab90c2894240f5a4dd9be51b6a02c94a04a6 Mon Sep 17 00:00:00 2001 From: Manabu Niseki Date: Sun, 21 Oct 2018 15:50:02 +0900 Subject: [PATCH] chore: switch from `thread` to `parallel` --- lib/miteru/cli.rb | 55 +------------------------ lib/miteru/crawler.rb | 93 ++++++++++++++++++++++++++++++++++++------- miteru.gemspec | 2 +- spec/cli_spec.rb | 52 ------------------------ spec/crawler_spec.rb | 66 ++++++++++++++++++++++++++++-- 5 files changed, 142 insertions(+), 126 deletions(-) diff --git a/lib/miteru/cli.rb b/lib/miteru/cli.rb index 5621aa0..14bd407 100644 --- a/lib/miteru/cli.rb +++ b/lib/miteru/cli.rb @@ -17,64 +17,11 @@ class CLI < Thor method_option :verbose, type: :boolean, default: true desc "execute", "Execute the crawler" def execute - websites = Crawler.execute( - directory_traveling: options[:directory_traveling], - size: options[:size], - threads: options[:threads], - verbose: options[:verbose] - ) - websites.each do |website| - next unless website.has_kit? - - message = "#{website.url}: it might contain phishing kit(s) (#{website.compressed_files.join(', ')})." - puts message.colorize(:light_red) - post_to_slack(website.message) if options[:post_to_slack] && valid_slack_setting? - download_compressed_files(website.url, website.compressed_files, options[:download_to]) if options[:auto_download] - end + Crawler.execute options.map { |k, v| [k.to_sym, v] }.to_h end no_commands do - def download_compressed_files(url, compressed_files, base_dir) - compressed_files.each do |path| - target_url = "#{url}/#{path}" - begin - download_file_path = HTTPClient.download(target_url, base_dir) - if duplicated?(download_file_path, base_dir) - puts "Do not download #{target_url} because there is a same hash file in the directory (SHA256: #{sha256(download_file_path)})." - FileUtils.rm download_file_path - else - puts "Download #{target_url} as #{download_file_path}" - end - rescue Down::Error => e - puts "Failed to download: #{target_url} (#{e})" - end - end - end - - def sha256(path) - digest = Digest::SHA256.file(path) - digest.hexdigest - end - - def duplicated?(file_path, base_dir) - base = sha256(file_path) - sha256s = Dir.glob("#{base_dir}/*.zip").map { |path| sha256(path) } - sha256s.select { |sha256| sha256 == base }.length > 1 - end - - def valid_slack_setting? - ENV["SLACK_WEBHOOK_URL"] != nil - end - - def post_to_slack(message) - webhook_url = ENV["SLACK_WEBHOOK_URL"] - raise ArgumentError, "Please set the Slack webhook URL via SLACK_WEBHOOK_URL env" unless webhook_url - - channel = ENV["SLACK_CHANNEL"] || "#general" - payload = { text: message, channel: channel } - HTTP.post(webhook_url, json: payload) - end end end end diff --git a/lib/miteru/crawler.rb b/lib/miteru/crawler.rb index a5438b4..9aa5fc4 100644 --- a/lib/miteru/crawler.rb +++ b/lib/miteru/crawler.rb @@ -3,12 +3,14 @@ require "csv" require "http" require "json" -require "thread/pool" +require "parallel" require "uri" module Miteru class Crawler + attr_reader :auto_download attr_reader :directory_traveling + attr_reader :download_to attr_reader :size attr_reader :threads attr_reader :verbose @@ -17,8 +19,11 @@ class Crawler OPENPHISH_ENDPOINT = "https://openphish.com" PHISHTANK_ENDPOINT = "http://data.phishtank.com" - def initialize(directory_traveling: false, size: 100, threads: 10, verbose: false) + def initialize(auto_download: false, directory_traveling: false, download_to: "/tmp", post_to_slack: false, size: 100, threads: 10, verbose: false) + @auto_download = auto_download @directory_traveling = directory_traveling + @download_to = download_to + @post_to_slack = post_to_slack @size = size @threads = threads @verbose = verbose @@ -69,7 +74,7 @@ def breakdown(url) def suspicious_urls @suspicious_urls ||= [].tap do |arr| - urls = (urlscan_feed + openphish_feed + phishtank_feed) + urls = (urlscan_feed + openphish_feed + phishtank_feed).select { |url| url.start_with?("http://", "https://") } urls.map { |url| breakdown(url) }.flatten.uniq.sort.each { |url| arr << url } end end @@ -77,31 +82,89 @@ def suspicious_urls def execute puts "Loaded #{suspicious_urls.length} URLs to crawl." if verbose - pool = Thread.pool(threads) websites = [] + Parallel.each(suspicious_urls, in_threads: threads) do |url| + website = Website.new(url) + + if website.has_kit? + message = "#{website.url}: it might contain phishing kit(s) (#{website.compressed_files.join(', ')})." + puts message.colorize(:light_red) + post_message_to_slack(website.message) if post_to_slack? && valid_slack_setting? + download_compressed_files(website.url, website.compressed_files, download_to) if auto_download? + else + puts "#{website.url}: it doesn't contain a phishing kit." if verbose + end + break + rescue StandardError => e + puts "Failed to load #{url} (#{e})" if verbose + end + websites + end - suspicious_urls.each do |url| - pool.process do - website = Website.new(url) - if website.has_kit? - websites << website + def self.execute(auto_download: false, directory_traveling: false, download_to: "/tmp", post_to_slack: false, size: 100, threads: 10, verbose: false) + new( + auto_download: auto_download, + directory_traveling: directory_traveling, + download_to: download_to, + post_to_slack: post_to_slack, + size: size, + threads: threads, + verbose: verbose + ).execute + end + + def download_compressed_files(url, compressed_files, base_dir) + compressed_files.each do |path| + target_url = "#{url}/#{path}" + begin + download_file_path = HTTPClient.download(target_url, base_dir) + if duplicated?(download_file_path, base_dir) + puts "Do not download #{target_url} because there is a same hash file in the directory (SHA256: #{sha256(download_file_path)})." + FileUtils.rm download_file_path else - puts "#{website.url}: it doesn't contain a phishing kit." if verbose - website.unbuild + puts "Download #{target_url} as #{download_file_path}" end + rescue Down::Error => e + puts "Failed to download: #{target_url} (#{e})" end end - pool.shutdown + end - websites + def post_to_slack(message) + webhook_url = ENV["SLACK_WEBHOOK_URL"] + raise ArgumentError, "Please set the Slack webhook URL via SLACK_WEBHOOK_URL env" unless webhook_url + + channel = ENV["SLACK_CHANNEL"] || "#general" + + payload = { text: message, channel: channel } + HTTP.post(webhook_url, json: payload) + end + + def post_to_slack? + @post_to_slack + end + + def auto_download? + @auto_download end - def self.execute(directory_traveling: false, size: 100, threads: 10, verbose: false) - new(directory_traveling: directory_traveling, size: size, threads: threads, verbose: verbose).execute + def valid_slack_setting? + ENV["SLACK_WEBHOOK_URL"] != nil end private + def sha256(path) + digest = Digest::SHA256.file(path) + digest.hexdigest + end + + def duplicated?(file_path, base_dir) + base = sha256(file_path) + sha256s = Dir.glob("#{base_dir}/*.zip").map { |path| sha256(path) } + sha256s.select { |sha256| sha256 == base }.length > 1 + end + def get(url) res = HTTP.follow(max_hops: 3).get(url) raise HTTPResponseError if res.code != 200 diff --git a/miteru.gemspec b/miteru.gemspec index 61b3a1a..8fa19f4 100644 --- a/miteru.gemspec +++ b/miteru.gemspec @@ -36,6 +36,6 @@ Gem::Specification.new do |spec| spec.add_dependency "down", "~> 4.5" spec.add_dependency "http", "~> 3.3" spec.add_dependency "oga", "~> 2.15" + spec.add_dependency "parallel", "~> 1.12" spec.add_dependency "thor", "~> 0.19" - spec.add_dependency "thread", "~> 0.2.2" end diff --git a/spec/cli_spec.rb b/spec/cli_spec.rb index 3d8758c..67cb0bf 100644 --- a/spec/cli_spec.rb +++ b/spec/cli_spec.rb @@ -1,10 +1,7 @@ # frozen_string_literal: true RSpec.describe Miteru::CLI do - include_context "http_server" - include_context "download_compressed_files" subject { Miteru::CLI.new } - before(:each) { ENV.delete "SLACK_WEBHOOK_URL" } describe "#execute" do before do @@ -14,53 +11,4 @@ capture(:stdout) { Miteru::CLI.start %w(execute) } end end - - describe "#download_compressed_files" do - before { WebMock.disable! } - after { WebMock.enable! } - context "when it runs once" do - it "should download a file" do - url = "http://#{host}:#{port}/has_kit" - compressed_files = ["test.zip"] - expect(Dir.glob("#{base_dir}/*.zip").empty?).to be(true) - capture(:stdout) { subject.download_compressed_files(url, compressed_files, base_dir) } - download_files = Dir.glob("#{base_dir}/*.zip") - expect(download_files.empty?).to be(false) - expect(download_files.length).to eq(1) - end - end - context "when it runs multiple times" do - it "should remove duplicated files" do - url = "http://#{host}:#{port}/has_kit" - compressed_files = ["test.zip"] - expect(Dir.glob("#{base_dir}/*.zip").empty?).to be(true) - capture(:stdout) { subject.download_compressed_files(url, compressed_files, base_dir) } - capture(:stdout) { subject.download_compressed_files(url, compressed_files, base_dir) } - capture(:stdout) { subject.download_compressed_files(url, compressed_files, base_dir) } - download_files = Dir.glob("#{base_dir}/*.zip") - expect(download_files.empty?).to be(false) - expect(download_files.length).to eq(1) - end - end - end - describe "#valid_slack_setting?" do - context "when set ENV['SLACK_WEBHOOK_URL']" do - before { ENV["SLACK_WEBHOOK_URL"] = "test" } - it "should return true" do - expect(subject.valid_slack_setting?).to be(true) - end - end - context "when not set ENV['SLACK_WEBHOOK_URL']" do - it "should return false" do - expect(subject.valid_slack_setting?).to be(false) - end - end - end - describe "#post_to_slack" do - context "when not set ENV['SLACK_WEBHOOK_URL']" do - it "should return false" do - expect { subject.post_to_slack("test") }.to raise_error(ArgumentError) - end - end - end end diff --git a/spec/crawler_spec.rb b/spec/crawler_spec.rb index db62582..04c5880 100644 --- a/spec/crawler_spec.rb +++ b/spec/crawler_spec.rb @@ -2,6 +2,10 @@ RSpec.describe Miteru::Crawler, :vcr do include_context "http_server" + include_context "download_compressed_files" + + before(:each) { ENV.delete "SLACK_WEBHOOK_URL" } + subject { Miteru::Crawler } describe "#breakdown" do @@ -31,6 +35,7 @@ end end end + describe "#urlscan_feed" do context "without 'size' option" do it "should return an Array" do @@ -54,18 +59,21 @@ end end end + describe "#openphish_feed" do it "should return an Array" do results = subject.new.openphish_feed expect(results).to be_an(Array) end end + describe "#phishtank_feed" do it "should return an Array" do results = subject.new.phishtank_feed expect(results).to be_an(Array) end end + describe "#suspicious_urls" do it "should return an Array" do results = subject.new.suspicious_urls @@ -73,14 +81,64 @@ expect(results.length).to eq(results.uniq.length) end end + + describe "#download_compressed_files" do + before { WebMock.disable! } + after { WebMock.enable! } + context "when it runs once" do + it "should download a file" do + url = "http://#{host}:#{port}/has_kit" + compressed_files = ["test.zip"] + expect(Dir.glob("#{base_dir}/*.zip").empty?).to be(true) + capture(:stdout) { subject.new.download_compressed_files(url, compressed_files, base_dir) } + download_files = Dir.glob("#{base_dir}/*.zip") + expect(download_files.empty?).to be(false) + expect(download_files.length).to eq(1) + end + end + context "when it runs multiple times" do + it "should remove duplicated files" do + url = "http://#{host}:#{port}/has_kit" + compressed_files = ["test.zip"] + expect(Dir.glob("#{base_dir}/*.zip").empty?).to be(true) + capture(:stdout) { subject.new.download_compressed_files(url, compressed_files, base_dir) } + capture(:stdout) { subject.new.download_compressed_files(url, compressed_files, base_dir) } + capture(:stdout) { subject.new.download_compressed_files(url, compressed_files, base_dir) } + download_files = Dir.glob("#{base_dir}/*.zip") + expect(download_files.empty?).to be(false) + expect(download_files.length).to eq(1) + end + end + end + + describe "#valid_slack_setting?" do + context "when set ENV['SLACK_WEBHOOK_URL']" do + before { ENV["SLACK_WEBHOOK_URL"] = "test" } + it "should return true" do + expect(subject.new.valid_slack_setting?).to be(true) + end + end + context "when not set ENV['SLACK_WEBHOOK_URL']" do + it "should return false" do + expect(subject.new.valid_slack_setting?).to be(false) + end + end + end + + describe "#post_to_slack" do + context "when not set ENV['SLACK_WEBHOOK_URL']" do + it "should return false" do + expect { subject.new.post_to_slack("test") }.to raise_error(ArgumentError) + end + end + end + describe ".execute" do before do allow_any_instance_of(Miteru::Crawler).to receive(:suspicious_urls).and_return(["http://#{host}:#{port}/has_kit"]) end - it "should return an Array" do - results = subject.execute - expect(results).to be_an(Array) - expect(results.length).to eq(1) + it "should not raise any error" do + capture(:stdout) { expect { subject.execute }.to_not raise_error } end end end