-
Notifications
You must be signed in to change notification settings - Fork 9
/
web_analyzer.rb
97 lines (83 loc) · 2.62 KB
/
web_analyzer.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# frozen_string_literal: true
require "http"
require "json"
require "oga"
require "uri"
require "simpleidn"
require "parallel"
module Ayashige
  module Sources
    # Crawls the "new created domains" listings published under BASE_URL and
    # saves every domain that `Domain#suspicious?` flags into the store.
    #
    # `Store` and `Domain` are defined elsewhere in the project; this class only
    # relies on `Store#exists?`, `Store#store`, `Domain#suspicious?` and
    # `Domain#to_s`.
    class WebAnalyzer
      BASE_URL = "https://wa-com.com"
      TLDS = %w[
        com net org info us bid biz cat club download life live ltd men pro
        review shop stream tech today trade win world xyz
      ].freeze
      # Exclusive upper bound for the page index requested per TLD.
      LIMIT = 5_000

      # CSS selectors for the scraped pages, kept in one place so that a
      # markup change on the site only needs to be fixed here.
      LINKS_SELECTOR = "#form1 > div.container.mt30 > div:nth-child(1) > div.col-lg-12.col-md-12.col-sm-12.col-xs-12.nopadding.mt10 > span"
      DOMAIN_ROWS_SELECTOR = "#tblListDomain > tbody > tr"
      LATEST_DATE_SELECTOR = "#form1 > div.container.mt30 > div:nth-child(3) > div:nth-child(2) > table > tbody > tr:nth-child(1) > td > div > span:nth-child(1) > span.date > time"

      def initialize
        @store = Store.new
      end

      # @param date [String] date string as shown on the index page
      # @return [Boolean] whatever `Store#exists?` reports for that date
      def already_stored?(date)
        @store.exists? date
      end

      # Walks every TLD listing for the latest indexed date and stores the
      # suspicious domains found there. Does nothing (apart from logging) when
      # the date cannot be determined or has already been processed.
      #
      # @return [void]
      def store_newly_registered_domains
        date = latest_indexed_date
        unless date
          puts "failed to detect the latest indexed date."
          return
        end
        if already_stored?(date)
          puts "domains which updated on #{date} are already stored."
          return
        end

        # NOTE(review): Parallel.each may run this block in separate worker
        # processes or threads; confirm that sharing @store that way is safe.
        Parallel.each(TLDS) { |tld| store_domains_for_tld(date, tld) }
      end

      # Fetches and parses one listing page.
      #
      # @param date [String] indexed date used in the URL path
      # @param tld [String] top level domain, e.g. "com"
      # @param index [Integer] page number (the crawl starts at 1)
      # @return [Object, nil] parsed document, or nil when parsing failed
      def get_page(date, tld, index)
        url = "#{BASE_URL}/#{date}/new-created-domains/#{tld}/p/#{index}"
        res = HTTP.get(url)
        html2doc(res.body.to_s)
      end

      # Collects the href of every pagination link found in the document.
      #
      # @param doc [Object, nil] parsed document
      # @return [Array<String>] hrefs; empty when doc is nil or has no links
      def get_links_from_doc(doc)
        return [] unless doc

        doc.css(LINKS_SELECTOR).map do |page|
          page.at_css("a")&.get("href")
        end.compact
      end

      # Extracts the listed domains from a listing page.
      #
      # Rows that do not contain both the domain cell and the updated cell
      # are skipped instead of raising.
      #
      # @param doc [Object, nil] parsed document
      # @return [Array<Hash>] hashes with :domain (ASCII/punycode form as
      #   returned by SimpleIDN.to_ascii) and :updated; empty when doc is nil
      def get_domains_from_doc(doc)
        return [] unless doc

        doc.css(DOMAIN_ROWS_SELECTOR).map do |row|
          cols = row.css("td")
          domain = cols[0]&.at_css("a > span > span")
          updated = cols[1]&.at_css("span")
          next unless domain && updated

          { domain: SimpleIDN.to_ascii(domain.text), updated: updated.text }
        end.compact
      end

      # Latest date shown on the index page, memoized once it has been found.
      #
      # @return [String, nil] the date text, or nil when the page could not
      #   be parsed or the date element is missing
      def latest_indexed_date
        @latest_indexed_date ||= fetch_latest_indexed_date
      end

      # Parses HTML on a best-effort basis.
      #
      # @param html [String]
      # @return [Object, nil] the document returned by Oga.parse_html, or nil
      #   when parsing raised a StandardError
      def html2doc(html)
        Oga.parse_html html
      rescue StandardError
        nil
      end

      private

      # Stores the suspicious domains of one TLD, page by page, stopping at
      # the first page that yields no domains (or could not be parsed).
      def store_domains_for_tld(date, tld)
        (1...LIMIT).each do |index|
          domains = get_domains_from_doc(get_page(date, tld, index))
          break if domains.empty?

          domains.each { |entry| store_if_suspicious(entry) }
        end
      end

      # Saves a single scraped entry when its domain is flagged as suspicious.
      def store_if_suspicious(entry)
        domain = Domain.new(entry[:domain])
        return unless domain.suspicious?

        @store.store entry[:updated], domain.to_s
        puts "#{domain} is stored."
      end

      # Reads the latest indexed date from the index page without memoizing.
      def fetch_latest_indexed_date
        doc = html2doc(HTTP.get(BASE_URL).body.to_s)
        return unless doc

        date = doc.css(LATEST_DATE_SELECTOR).text
        # An empty string means the element was not found; treat it as missing
        # rather than building URLs with an empty date segment.
        date unless date.empty?
      end
    end
  end
end