diff --git a/scripts/lib/analyzer.rb b/scripts/lib/analyzer.rb index a860bd4..b32cf38 100644 --- a/scripts/lib/analyzer.rb +++ b/scripts/lib/analyzer.rb @@ -66,7 +66,17 @@ def analyze(uri, content, params) link = uri.merge(link) if link.relative? if link.scheme == 'http' if uri.host == link.host # Stay in same site: By design, relax/remove this at your own risk - links << [link, uri] unless links.include? link + if params[:scope_uri] + root_path = params[:url].path.match(/.*\//) + link_path = link.path.match(/.*\//) + if root_path and link_path and link_path[0].include?(root_path[0]) + links << [link, uri] unless links.include? link + else + log.warn("Skipping as path outside root scope #{link}") unless params[:quiet] + end + else + links << [link, uri] unless links.include? link + end else log.warn("Skipping as host differs #{link}") unless params[:quiet] end @@ -78,8 +88,8 @@ def analyze(uri, content, params) end end crawled_title = find_title(doc, params[:title_strip]) - rescue - log.warn("Error extracting links for #{uri}") + #rescue + #log.warn("Error extracting links for #{uri}") ensure content.rewind end diff --git a/scripts/lib/cache.rb b/scripts/lib/cache.rb index 5e6a515..69c81f0 100644 --- a/scripts/lib/cache.rb +++ b/scripts/lib/cache.rb @@ -15,7 +15,7 @@ def initialize(site_uri, at, db = SQLite3::Database.new(":memory:")) end def Cache.from_path(site_uri, at, path, delete_old = false) - filename = File.join(path, site_uri.host.gsub(/\./, '_')) + filename = File.join(path, (site_uri.host + "/" + site_uri.path).gsub(/\W/, '_')) File.delete(filename) if delete_old and File.exists?(filename) db = SQLite3::Database.new(filename) Cache.new(site_uri, at, db) diff --git a/scripts/lib/options.rb b/scripts/lib/options.rb index 53ed200..28013e5 100644 --- a/scripts/lib/options.rb +++ b/scripts/lib/options.rb @@ -39,6 +39,7 @@ def CrawlerOptions.get_options(args) opts.on("-w N", "--wait N", Integer, "Wait N seconds between each fetch") {|n| options[:wait] = n } opts.on("-e", "--index", "Crawl AND index the content") { |v| options[:index] = v } opts.on("-q", "--quiet", "Reduce log messages to informational only") { |q| options[:quiet] = q } + opts.on("--scope-to-root", "Only index if uri matches, completely, the initial root url path") { |s| options[:scope_uri] = true } opts.on("--local-cache", "Enable local caching of data (off by default)") {|l| options[:cache_enabled] = l } opts.on("-h N", "--threads N", Integer, "Set number of crawler threads to use") {|t| options[:threads] = t} opts.on("--yuk", "Horrible hack to fix poor CDATA termination, specific to a site - fix") {|y| options[:yuk] = y } diff --git a/scripts/test/test_analyzer.rb b/scripts/test/test_analyzer.rb index e931003..dc09b64 100644 --- a/scripts/test/test_analyzer.rb +++ b/scripts/test/test_analyzer.rb @@ -30,7 +30,7 @@ def test_analyze_only_text_html assert_equal [to_href("http://www.foo.com/hello.html")], content.readlines end end - + def test_with_link p = { :log => @log, :user_agent => "007", :from => "mars" } add_expect_uri("http://www.foo.com/", to_href("http://www.foo.com/hello.html")) @@ -40,7 +40,7 @@ def test_with_link assert_equal [to_href("http://www.foo.com/hello.html")], content.readlines end end - + def test_no_cross_site @log.expects(:warn).once.with('Skipping as host differs http://www.bar.com/hello.html') p = { :log => @log, :user_agent => "007", :from => "mars" } @@ -51,7 +51,7 @@ def test_no_cross_site assert_equal [to_href("http://www.bar.com/hello.html")], content.readlines end end - + def test_follow_only_http_links @log.expects(:warn).once.with('Skipping as non-http file://hello.html') p = { :log => @log, :user_agent => "007", :from => "mars" } @@ -63,6 +63,27 @@ def test_follow_only_http_links end end + def test_stay_in_root_uri_scope + @log.expects(:warn).once.with('Skipping as path outside root scope http://www.foo.com/a/c/') + root = "http://www.foo.com/a/b/index.htm" + uri = "http://www.foo.com/a/b/foo.txt" + p = { :url => URI.parse(root), :log => @log, :user_agent => "007", :from => "mars", :scope_uri => true } + add_expect_uri(uri, to_href("http://www.foo.com/a/c/")) + Analyzer.new().extract_links(URI.parse(uri), "peter pan", nil, p) do |crawled_title, new_links, content| + assert_equal nil, crawled_title + assert_equal([], new_links) + end + + @log.expects(:warn).once.with('Skipping as path outside root scope http://www.foo.com/a/c/bar.txt') + uri = "http://www.foo.com/a/b/foo.txt" + p = { :url => URI.parse(root), :log => @log, :user_agent => "007", :from => "mars", :scope_uri => true } + add_expect_uri(uri, to_href("../c/bar.txt")) + Analyzer.new().extract_links(URI.parse(uri), "peter pan", nil, p) do |crawled_title, new_links, content| + assert_equal nil, crawled_title + assert_equal([], new_links) + end + end + def test_bad_uri @log.expects(:warn).once.with('Invalid link in page http://www.foo.com/ : bad URI(is not URI?): :bar:foo') p = { :log => @log, :user_agent => "007", :from => "mars" } @@ -73,7 +94,7 @@ def test_bad_uri assert_equal [to_href(":bar:foo")], content.readlines end end - + def test_no_links p = { :log => @log, :user_agent => "007", :from => "mars" } add_expect_uri("http://www.foo.com/") @@ -83,7 +104,7 @@ def test_no_links assert_equal ['Hello world'], content.readlines end end - + def test_head_title body = '