-
Notifications
You must be signed in to change notification settings - Fork 434
/
spider_test.rb
175 lines (150 loc) · 5.3 KB
/
spider_test.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
require_relative '../../test_helper'
require 'benchmark'
require 'nokogiri'
class Webui::SpiderTest < Webui::IntegrationTest
# Tells the spider whether +link+ is on the skip list.
#
# A link is skipped when it contains one of the listed path fragments or
# ends with one of the listed paths.
#
# @param link [String] URL of a candidate page
# @return [true, nil] true when the link must not be visited, nil otherwise
def ignore_link?(link)
  # fragments that disqualify a link wherever they occur in it
  skipped_fragments = [
    %r{/mini-profiler-resources},
    # that link is just a top ref
    %r{/package/rdiff},
    %r{/package/show/SourceprotectedProject},
    # this is crashing (bug)
    %r{/package/show/UseRemoteInstance},
    %r{/live_build_log/BinaryprotectedProject},
    %r{/live_build_log/SourceprotectedProject},
    %r{/live_build_log/home:Iggy/ToBeDeletedTestPack},
    %r{/live_build_log},
    # we do not really serve binary packages in the test environment
    %r{/package/binary/}
  ]
  # paths that disqualify a link only when the link ends with them
  skipped_endings = [
    # admin can see even the hidden
    '/package/show/HiddenRemoteInstance',
    '/project/show/HiddenRemoteInstance',
    '/project/show/RemoteInstance',
    '/package/show/BaseDistro3/pack2',
    '/project/show/BaseDistro3',
    '/package/show/home:Iggy/TestPack',
    '/package/show/home:Iggy/ToBeDeletedTestPack',
    '/project/show/home:Iggy',
    '/project/show/home:user6',
    # apidocs is not configured in test environment
    '/apidocs/index'
  ]

  return true if skipped_fragments.any? { |fragment| link =~ fragment }
  return true if link.end_with?(*skipped_endings)

  nil
end
# Queues every not-yet-seen, same-site link found in a parsed page.
#
# Each qualifying link is stored in @pages_to_visit as
# absolute URL => [URL of the page it was found on, link text].
#
# @param baseuri [String] URL of the page +body+ was loaded from
# @param body [#traverse] parsed document root whose nodes answer
#   +element?+, +name+, +attributes+ and +content+
# @return [Object, nil] nil for skipped projects, otherwise whatever
#   +body.traverse+ returns (callers in this file ignore it)
def getlinks(baseuri, body)
  # skip some uninteresting projects
  return if baseuri.match?(/project=home%3Afred/)
  return if baseuri.match?(/project=home%3Acoolo/)
  return if baseuri.match?(/project=deleted/)

  baseuri = URI.parse(baseuri)
  body.traverse do |tag|
    next unless tag.element?
    next unless tag.name == 'a'
    # anchors carrying data-remote / data-method are not followed
    next if tag.attributes['data-remote']
    next if tag.attributes['data-method']

    link = tag.attributes['href']
    begin
      link = baseuri.merge(link)
    rescue ArgumentError, URI::Error
      # if merge does not like it, it's not a valid link:
      # ArgumentError covers a missing href, URI::Error a malformed one
      next
    end

    # the fragment does not change which page is requested
    link.fragment = nil
    link.normalize!
    # stay on the site under test
    next unless link.host == baseuri.host
    next unless link.port == baseuri.port

    link = link.to_s
    next if ignore_link?(link)
    next if tag.content == 'show latest'
    next if @pages_visited.key?(link)
    next if @pages_to_visit.key?(link)

    @pages_to_visit[link] = [baseuri.to_s, tag.content]
  end
end
# Reports a problem found on +url+ and fails the crawl.
#
# Before raising, the chain of pages that led to +url+ (as recorded in
# @pages_visited) is written with +warn+, one level deeper per hop.
#
# @param message [String] description of the problem
# @param url [String] page the problem was found on
# @return [nil] only for URLs containing "/source/", which are known issues
# @raise [RuntimeError] "Found <message>" for every other URL
def raiseit(message, url)
  # known issues
  return if url =~ %r{/source/}

  warn "Found #{message} on #{url}, crawling path"

  referrer = url
  depth = 1
  while @pages_visited.key?(referrer)
    referrer, link_text = @pages_visited[referrer]
    # the start page has no referrer, so the chain ends here
    break if referrer.blank?

    warn "#{' ' * depth}#{referrer} ('#{link_text}')"
    depth += 1
  end

  raise "Found #{message}"
end
# Visits every queued page until @pages_to_visit is empty.
#
# The queue is seeded from the sitemaps; each visited page is moved to
# @pages_visited, checked for error markers and scanned for further links.
# Problems are reported through +raiseit+, which raises unless the URL is a
# known issue. For a known issue the page is skipped and crawling continues
# with the remaining queue.
#
# @return [nil]
# @raise [RuntimeError] via +raiseit+ when a page shows an error
def crawl
  load_sitemap('/sitemaps')
  until @pages_to_visit.empty?
    # always take the smallest URL so the visiting order is deterministic
    theone = @pages_to_visit.keys.min
    @pages_visited[theone] = @pages_to_visit.delete(theone)

    begin
      # puts "V #{theone} #{@pages_to_visit.length}/#{@pages_visited.keys.length + @pages_to_visit.length}"
      page.visit(theone)
      if page.status_code != 200
        # raiseit only comes back for known issues: skip this page, keep crawling
        raiseit("Status code #{page.status_code}", theone)
        next
      end
      unless %r{text/html}.match?(page.response_headers['Content-Type'])
        # puts "ignoring #{page.response_headers.inspect}"
        next
      end
      # NOTE(review): presumably waits for the page layout to be present - confirm
      page.first('.navbar-brand')
    rescue Timeout::Error
      next
    rescue ActionController::RoutingError
      # same as above: a known issue must not end the crawl
      raiseit('routing error', theone)
      next
    end

    body = begin
      Nokogiri::HTML4::Document.parse(page.source).root
    rescue Nokogiri::XML::SyntaxError
      # puts "HARDCORE!! #{theone}"
      nil
    end
    next unless body

    flashes = body.css('div#flash div.alert-error')
    raiseit("flash alert #{flashes.first.content.strip}", theone) unless flashes.empty?
    body.css('h1').each do |h|
      raiseit('Internal Server Error', theone) if h.content == 'Internal Server Error'
    end
    body.css('h2').each do |h|
      raiseit('XML errors', theone) if h.content == 'XML errors'
    end
    body.css('#exception-error').each do |e|
      raiseit("error '#{e.content}'", theone)
    end
    getlinks(theone, body)
  end
end
# Seeds @pages_to_visit from the sitemap served at +url+.
#
# Nested 'sitemap' entries are loaded recursively; every 'url' entry that is
# not on the skip list is queued with this sitemap as its referrer.
#
# @param url [String] location of a sitemap or sitemap index
# @return [Object, nil] nil when the sitemap does not answer with status 200
def load_sitemap(url)
  page.visit(url)
  return if page.status_code != 200

  sitemap = Xmlhash.parse(page.source)

  sitemap.elements('sitemap') { |nested| load_sitemap(nested['loc']) }

  sitemap.elements('url') do |entry|
    location = entry['loc']
    next if ignore_link?(location)

    @pages_to_visit[location] = [url, 'sitemap']
  end
end
# Test setup hook: starts the test backend via Backend::Test.start.
# NOTE(review): wait_for_scheduler: true presumably blocks until the backend
# scheduler is ready - confirm against Backend::Test.
def setup
Backend::Test.start(wait_for_scheduler: true)
end
# Crawls the web UI without logging in, starting at the root page, and
# expects more than 800 pages to have been visited.
def test_spider_anonymously
  visit root_path
  # the start page has no referrer and no link text
  @pages_to_visit = { page.current_url => [nil, nil] }
  @pages_visited = {}
  crawl
  ActiveRecord::Base.clear_active_connections!
  # plain assertion instead of the deprecated global `must_be` expectation
  assert_operator @pages_visited.size, :>, 800
end
# Crawls the web UI after login_king, starting at the root page, and
# expects more than 1200 pages to have been visited.
def test_spider_as_admin
  login_king(to: root_path)
  # the start page has no referrer and no link text
  @pages_to_visit = { page.current_url => [nil, nil] }
  @pages_visited = {}
  crawl
  ActiveRecord::Base.clear_active_connections!
  # plain assertion instead of the deprecated global `must_be` expectation
  assert_operator @pages_visited.size, :>, 1200
end
end