-
Notifications
You must be signed in to change notification settings - Fork 434
/
spider_test.rb
143 lines (129 loc) · 5.01 KB
/
spider_test.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
require File.expand_path(File.dirname(__FILE__) + "/..") + "/test_helper"
require 'benchmark'
require 'nokogiri'
require 'home_controller'
class SpiderTest < ActionDispatch::IntegrationTest
# Queues every crawlable link found in +body+ that has not been seen yet.
#
# baseuri - String URL of the page that +body+ was loaded from.
# body    - parsed document node (crawl passes a Nokogiri root node); it is
#           walked with +traverse+.
#
# For each accepted <a> element the normalized, fragment-less URL is stored in
# @pages_to_visit as url => [referring page URL, link text], unless the URL is
# already a key of @pages_visited or is already queued. Pages belonging to a
# few uninteresting projects are not scanned at all.
def getlinks(baseuri, body)
  # skip some uninteresting projects
  return if baseuri =~ %r{project=home%3Afred}
  return if baseuri =~ %r{project=home%3Acoolo}
  return if baseuri =~ %r{project=deleted}

  baseuri = URI.parse(baseuri)

  body.traverse do |tag|
    next unless tag.element?
    next unless tag.name == 'a'
    # links carrying these attributes are not followed
    # (presumably Rails UJS actions rather than plain GET navigation)
    next if tag.attributes['data-remote']
    next if tag.attributes['data-method']

    begin
      link = baseuri.merge(tag.attributes['href'])
    rescue ArgumentError, URI::Error
      # ArgumentError: there is no usable href (e.g. the attribute is missing).
      # URI::Error: the href cannot be parsed or merged as a URI.
      # Either way merge does not like it, so it's not a valid link.
      next
    end

    # the same page with a different anchor must only be visited once
    link.fragment = nil
    link.normalize!

    # never leave the site under test
    next unless link.host == baseuri.host
    next unless link.port == baseuri.port

    link = link.to_s
    next if link =~ %r{/mini-profiler-resources}
    # that link is just a top ref
    next if link.end_with? "/package/rdiff"
    # admin can see even the hidden
    next if link.end_with? "/package/show/HiddenRemoteInstance"
    next if link.end_with? "/project/show/HiddenRemoteInstance"
    next if link.end_with? "/project/show/RemoteInstance"

    next if @pages_visited.key?(link)

    # keep the first referrer we saw, it is used to print the crawling path
    @pages_to_visit[link] ||= [baseuri.to_s, tag.content]
  end
end
# Reports a problem found while crawling, unless it is a known issue.
#
# message - String describing what was found on the page.
# url     - String URL of the offending page.
#
# Returns nil without doing anything when +url+ is on the known-issues list.
# Otherwise prints the problem and the chain of referring pages recorded in
# @pages_visited to stderr and raises a RuntimeError "Found <message>".
def raiseit(message, url)
  # known issues
  known_patterns = [
    %r{/package/binary/BinaryprotectedProject/.*},
    %r{/package/statistics/BinaryprotectedProject/.*},
    %r{/package/revisions/SourceprotectedProject.*}
  ]
  known_suffixes = [
    "/package/binary/SourceprotectedProject/pack?arch=i586&filename=package-1.0-1.src.rpm&repository=repo",
    "/package/show/kde4/kdelibs?rev=1",
    "/package/show/SourceprotectedProject/target",
    "/package/users/SourceprotectedProject/pack",
    "/package/view_file/BaseDistro:Update/pack2?file=my_file&rev=1",
    "/package/view_file/Devel:BaseDistro:Update/pack2?file=my_file&rev=1",
    "/package/view_file/Devel:BaseDistro:Update/pack3?file=my_file&rev=1",
    "/package/view_file/LocalProject/remotepackage?file=my_file&rev=1",
    "/package/view_file/BaseDistro2.0:LinkedUpdateProject/pack2.linked?file=myfile&rev=1",
    "/package/view_file/BaseDistro2.0/pack2.linked?file=myfile&rev=1",
    "/package/view_file/BaseDistro2.0:LinkedUpdateProject/pack2.linked?file=package.spec&rev=1",
    "/package/view_file/BaseDistro2.0/pack2.linked?file=package.spec&rev=1",
    "/project/edit/RemoteInstance",
    "/project/meta/HiddenRemoteInstance",
    "/project/show/HiddenRemoteInstance",
    "/project/edit/HiddenRemoteInstance"
  ]
  return if known_patterns.any? { |pattern| url =~ pattern }
  return if known_suffixes.any? { |suffix| url.end_with?(suffix) }

  $stderr.puts "Found #{message} on #{url}, crawling path"

  # walk back through the referrers, one more space of indent per hop
  padding = ' '
  referrer = url
  while @pages_visited.has_key?(referrer)
    referrer, anchor_text = @pages_visited[referrer]
    # the start page was stored without a referrer, so the path ends there
    break if referrer.blank?
    $stderr.puts "#{padding}#{referrer} ('#{anchor_text}')"
    padding += ' '
  end

  raise "Found #{message}"
end
# Visits every queued page and checks it for signs of an error.
#
# Works on @pages_to_visit (url => [referrer, link text]) until it is empty,
# always taking the alphabetically smallest URL next. Each page is moved to
# @pages_visited before it is loaded, its source is parsed with Nokogiri and
# checked for error flashes, an "Internal Server Error" headline, an
# "XML errors" headline and #exception-error elements. Every finding goes to
# raiseit, which raises unless the URL is a known issue. Finally the links of
# the page are queued through getlinks.
def crawl
  until @pages_to_visit.empty?
    # min gives the same page as sorting all keys and taking the first one
    theone = @pages_to_visit.keys.min
    @pages_visited[theone] = @pages_to_visit.delete(theone)

    begin
      puts "V #{theone} #{@pages_to_visit.length}/#{@pages_visited.length + @pages_to_visit.length}"
      page.visit(theone)
      page.first(:id, 'header-logo')
    rescue Timeout::Error
      next
    rescue ActionController::RoutingError
      raiseit("routing error", theone)
      # raiseit only comes back for known issues; go on with the other pages
      # instead of silently ending the whole crawl here
      next
    end

    body = begin
      Nokogiri::HTML::Document.parse(page.source).root
    rescue Nokogiri::XML::SyntaxError
      # a page that cannot be parsed has nothing we could inspect
      nil
    end
    next unless body

    flashes = body.css("div#flash-messages div.ui-state-error")
    raiseit("flash alert #{flashes.first.content.strip}", theone) unless flashes.empty?

    body.css('h1').each do |h|
      raiseit("Internal Server Error", theone) if h.content == 'Internal Server Error'
    end

    body.css('h2').each do |h|
      raiseit("XML errors", theone) if h.content == 'XML errors'
    end

    body.css("#exception-error").each do |e|
      raiseit("error '#{e.content}'", theone)
    end

    getlinks(theone, body)
  end
end
# Crawls the site without logging in, starting at the front page.
# Does nothing unless the RUN_SPIDER environment variable is set.
test "spider anonymously" do
  if ENV['RUN_SPIDER']
    visit "/"
    # the start page has neither a referrer nor a link text
    @pages_visited = {}
    @pages_to_visit = { page.current_url => [nil, nil] }
    crawl
  end
end
# Crawls the site after logging in through the login_king helper, starting at
# the front page. Does nothing unless the RUN_SPIDER environment variable is set.
test "spider as admin" do
  if ENV['RUN_SPIDER']
    login_king
    visit "/"
    # the start page has neither a referrer nor a link text
    @pages_visited = {}
    @pages_to_visit = { page.current_url => [nil, nil] }
    crawl
  end
end
end