Skip to content

Commit

Permalink
Merge 983481f into a713854
Browse files Browse the repository at this point in the history
  • Loading branch information
ninoseki committed Nov 27, 2018
2 parents a713854 + 983481f commit 6834492
Show file tree
Hide file tree
Showing 16 changed files with 2,419 additions and 63 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ gem "public_suffix"
gem "puma"
gem "redis"
gem "rollbar"
gem "rubyzip"
gem "simpleidn"
gem "sinatra-contrib"
gem "sinatra"
Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ GEM
ruby-ll (2.1.2)
ansi
ast
rubyzip (1.2.2)
safe_yaml (1.0.4)
simplecov (0.16.1)
docile (~> 1.1)
Expand Down Expand Up @@ -135,6 +136,7 @@ DEPENDENCIES
redis
rollbar
rspec
rubyzip
simpleidn
sinatra
sinatra-contrib
Expand Down
11 changes: 0 additions & 11 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,3 @@ require "rspec/core/rake_task"

RSpec::Core::RakeTask.new(:spec)
task default: :spec

$LOAD_PATH.unshift("#{__dir__}/lib")
require "ayashige"

desc "Fetching domains via WebAnalyzer"
task :fetch_domains_via_webanalyzer do
puts "Fetching domains via WebAnalyzer..."
job = Ayashige::Jobs::CronJob.new
job.perform
puts "done."
end
8 changes: 5 additions & 3 deletions bin/cron_job.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

require "ayashige"

begin
job = Ayashige::Jobs::CronJob.new
job.perform
# Runs the given block and returns its value. Any StandardError raised by the
# block is reported instead of propagated: through Rollbar when
# Ayashige::Rollbar.available? is truthy, otherwise printed with puts.
def with_error_handling
  yield
rescue StandardError => error
  return Rollbar.error(error) if Ayashige::Rollbar.available?

  puts error
end

with_error_handling { Ayashige::Sources::WebAnalyzer.new.store_newly_registered_domains }
with_error_handling { Ayashige::Sources::WhoisDS.new.store_newly_registered_domains }
4 changes: 2 additions & 2 deletions lib/ayashige.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@

require "ayashige/store"

require "ayashige/sources/source"
require "ayashige/sources/web_analyzer"
require "ayashige/sources/whoisds"

require "ayashige/rollbar"

require "ayashige/application"

require "ayashige/jobs/cron_job"

require "rollbar"

if Ayashige::Rollbar.available?
Expand Down
11 changes: 0 additions & 11 deletions lib/ayashige/jobs/cron_job.rb

This file was deleted.

32 changes: 32 additions & 0 deletions lib/ayashige/sources/source.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# frozen_string_literal: true

require "http"
require "oga"

module Ayashige
  module Sources
    # Base class for domain sources. It creates the Store used by subclasses
    # and offers HTML/XML parsing helpers that return nil instead of raising.
    class Source
      def initialize
        @store = Store.new
      end

      # Entry point every concrete source has to override.
      #
      # @raise [NotImplementedError] always, when called on a class that did
      #   not override it
      def store_newly_registered_domains
        raise NotImplementedError, "You must implement #{self.class}##{__method__}"
      end

      private

      # Parses +html+ with Oga; returns nil if parsing raises a StandardError.
      def html2doc(html)
        parse_or_nil { Oga.parse_html(html) }
      end

      # Parses +xml+ with Oga; returns nil if parsing raises a StandardError.
      def xml2doc(xml)
        parse_or_nil { Oga.parse_xml(xml) }
      end

      # Yields to the block and returns its value, or nil when the block
      # raises a StandardError (best-effort parsing).
      def parse_or_nil
        yield
      rescue StandardError
        nil
      end
    end
  end
end
24 changes: 2 additions & 22 deletions lib/ayashige/sources/web_analyzer.rb
Original file line number Diff line number Diff line change
@@ -1,33 +1,19 @@
# frozen_string_literal: true

require "http"
require "json"
require "oga"
require "uri"
require "simpleidn"
require "parallel"

module Ayashige
module Sources
class WebAnalyzer
class WebAnalyzer < Source
BASE_URL = "https://wa-com.com"
TLDS = %w(com net org info us bid biz cat club download life live ltd men pro review shop stream tech today trade win world xyz).freeze
LIMIT = 5_000

def initialize
@store = Store.new
end

def already_stored?(date)
@store.exists? date
end

def store_newly_registered_domains
date = latest_indexed_date
if already_stored?(date)
puts "domains which updated on #{date} are already stored."
return
end

Parallel.each(TLDS) do |tld|
index = 1
Expand All @@ -42,7 +28,7 @@ def store_newly_registered_domains
next unless domain.suspicious?

@store.store updated, domain.to_s, domain.score
puts "#{domain} is stored."
puts "WebAnalyzer: #{domain} is stored."
end
index += 1
end
Expand Down Expand Up @@ -86,12 +72,6 @@ def latest_indexed_date
out << time.text
end.first
end

def html2doc(html)
Oga.parse_html html
rescue StandardError => _
nil
end
end
end
end
65 changes: 65 additions & 0 deletions lib/ayashige/sources/whoisds.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# frozen_string_literal: true

require "zip"

module Ayashige
  module Sources
    # Source that reads the index page at BASE_URL, downloads the zip archive
    # linked from the first table on it, and stores every suspicious domain
    # listed in that archive.
    class WhoisDS < Source
      BASE_URL = "https://whoisds.com/newly-registered-domains"

      # Downloads the latest archive and stores each suspicious domain under
      # the latest indexed date.
      #
      # @raise [RuntimeError] when the date or the archive link cannot be
      #   extracted from the index page (previously this surfaced as an
      #   unrelated NoMethodError or HTTP error, or stored under a nil date)
      def store_newly_registered_domains
        date = latest_indexed_date
        link = latest_zip_file_link
        raise "WhoisDS: could not find the latest date or zip link on #{BASE_URL}" unless date && link

        res = HTTP.get(link)
        unzip(res.body.to_s).each do |line|
          domain = Domain.new(line)
          next unless domain.suspicious?

          @store.store date, domain.to_s, domain.score
          puts "WhoisDS: #{domain} is stored."
        end
      end

      # Extracts every entry of an in-memory zip archive and returns all of
      # their lines (line terminators removed) as one flat array.
      #
      # @param data [String] raw bytes of the zip archive
      # @return [Array<String>]
      def unzip(data)
        lines = []
        Zip::InputStream.open(StringIO.new(data)) do |zip_file|
          while (entry = zip_file.get_next_entry)
            lines.concat(entry.get_input_stream.read.lines.map(&:chomp))
          end
        end
        lines
      end

      # Parsed index page, or nil when parsing failed. Only a successful
      # parse is memoized, so a nil result is fetched again on the next call.
      def doc
        @doc ||= xml2doc(HTTP.get(BASE_URL).body.to_s)
      end

      # href of the first link found in a cell of the first table, or nil
      # when the page, the table or the link is missing.
      #
      # The original `break unless doc` inside `[].tap { ... }.first` made
      # tap return nil, so `.first` raised NoMethodError instead of yielding
      # the intended nil; the lookup is now nil-safe end to end.
      #
      # @return [String, nil]
      def latest_zip_file_link
        @latest_zip_file_link ||= begin
          a = first_table&.at_css("tr > td > a")
          a.get("href") if a
        end
      end

      # First whitespace-separated token of the first table cell, or nil when
      # the page, the table or the cell is missing.
      # NOTE(review): presumably that token is the archive's date - confirm
      # against the live page layout.
      #
      # @return [String, nil]
      def latest_indexed_date
        @latest_indexed_date ||= begin
          td = first_table&.at_css("tr > td")
          td.text.split.first if td
        end
      end

      private

      # First <table> of the index page, or nil when the page could not be
      # parsed or contains no table.
      def first_table
        doc&.at_css("table")
      end
    end
  end
end
Binary file added spec/fixtures/archive.zip
Binary file not shown.

0 comments on commit 6834492

Please sign in to comment.