Skip to content

Commit

Permalink
improved crawler output
Browse files Browse the repository at this point in the history
  • Loading branch information
matugm committed Apr 2, 2012
1 parent 52760af commit ed98576
Showing 1 changed file with 18 additions and 17 deletions.
35 changes: 18 additions & 17 deletions lib/crawl.rb
Expand Up @@ -93,33 +93,34 @@ def normalize
@abs_links.map { |link| link.sub('http://','') } @abs_links.map { |link| link.sub('http://','') }
end end


def print_output(*string) def puts_file(string)
string = string[0] || ""
puts string puts string
@ofile.puts string if @ofile @ofile.puts string if @ofile
end end


# Emits one titled section through puts_file.
#
# Three puts_file calls are made, in order: the title, the section body,
# and an empty string as a trailing separator. The body is the literal
# text 'nothing found.' when data compares equal to an empty array;
# otherwise data itself is passed along unchanged.
#
# title - heading passed to puts_file first.
# data  - section content; callers are expected to pass an array of links.
#
# Returns whatever the final puts_file call returns.
def print_link(title, data)
  body = data == [] ? 'nothing found.' : data

  puts_file(title)
  puts_file(body)
  puts_file('')
end

def print_links(ofile) def print_links(ofile)
@ofile = ofile @ofile = ofile
@abs_links = normalize @abs_links = normalize
final_links = @abs_links + to_absolute(@rel_links) final_links = @abs_links + to_absolute(@rel_links)
final_links = final_links.sort.uniq { |link| link[/.*\?\w+/] } # Conseguir links con parametros unicos final_links = final_links.sort.uniq { |link| link[/.*\?\w+/] } # Conseguir links con parametros unicos


print_output "---- External links" print_link "[External links]", @ext_links.sort.uniq
print_output @ext_links.sort.uniq print_link "[Absolute links]", @abs_links.sort.uniq { |link| link[/.*#\w+/] }
print_output print_link "[Relative links]", @rel_links.sort.uniq { |link| link[/(?:\/\w+)+/] }
print_output "---- Absolute links" print_link "[E-mail accounts] (:mailto)", @mail_links.sort_by { |s| [ s[/@.*/], s[/.*@/] ] }.uniq.map { |m| m.sub('mailto:','') }
print_output @abs_links.sort.uniq { |link| link[/.*#\w+/] } print_link "[Parametized queries]", final_links.grep(/\?/)
print_output
print_output "---- Relative links"
print_output @rel_links.sort.uniq { |link| link[/(?:\/\w+)+/] }
print_output
print_output "---- E-mail accounts (:mailto)"
print_output @mail_links.sort_by { |s| [ s[/@.*/], s[/.*@/] ] }.uniq.map { |m| m.sub('mailto:','') }
print_output
print_output "**** Parametized queries"
print_output final_links.grep(/\?/)
print_output
end end


end end

0 comments on commit ed98576

Please sign in to comment.