diff --git a/scripts/lib/analyzer.rb b/scripts/lib/analyzer.rb
index a860bd4..b32cf38 100644
--- a/scripts/lib/analyzer.rb
+++ b/scripts/lib/analyzer.rb
@@ -66,7 +66,17 @@ def analyze(uri, content, params)
         link = uri.merge(link) if link.relative?
         if link.scheme == 'http'
           if uri.host == link.host # Stay in same site: By design, relax/remove this at your own risk
-            links << [link, uri] unless links.include? link
+            if params[:scope_uri]
+              root_path = params[:url].path.match(/.*\//)
+              link_path = link.path.match(/.*\//)
+              if root_path and link_path and link_path[0].include?(root_path[0])
+                links << [link, uri] unless links.include? link
+              else
+                log.warn("Skipping as path outside root scope #{link}") unless params[:quiet]
+              end
+            else
+              links << [link, uri] unless links.include? link
+            end
           else
             log.warn("Skipping as host differs #{link}") unless params[:quiet]
           end
@@ -78,8 +88,8 @@ def analyze(uri, content, params)
       end
     end
     crawled_title = find_title(doc, params[:title_strip])
-  rescue
-    log.warn("Error extracting links for #{uri}")
+  #rescue
+    #log.warn("Error extracting links for #{uri}")
   ensure
     content.rewind
   end
diff --git a/scripts/lib/cache.rb b/scripts/lib/cache.rb
index 5e6a515..69c81f0 100644
--- a/scripts/lib/cache.rb
+++ b/scripts/lib/cache.rb
@@ -15,7 +15,7 @@ def initialize(site_uri, at, db = SQLite3::Database.new(":memory:"))
   end
 
   def Cache.from_path(site_uri, at, path, delete_old = false)
-    filename = File.join(path, site_uri.host.gsub(/\./, '_'))
+    filename = File.join(path, (site_uri.host + "/" + site_uri.path).gsub(/\W/, '_'))
     File.delete(filename) if delete_old and File.exists?(filename)
     db = SQLite3::Database.new(filename)
     Cache.new(site_uri, at, db)
diff --git a/scripts/lib/options.rb b/scripts/lib/options.rb
index 53ed200..28013e5 100644
--- a/scripts/lib/options.rb
+++ b/scripts/lib/options.rb
@@ -39,6 +39,7 @@ def CrawlerOptions.get_options(args)
     opts.on("-w N", "--wait N", Integer, "Wait N seconds between each fetch") {|n| options[:wait] = n }
     opts.on("-e", "--index", "Crawl AND index the content") { |v| options[:index] = v }
     opts.on("-q", "--quiet", "Reduce log messages to informational only") { |q| options[:quiet] = q }
+    opts.on("--scope-to-root", "Only index if uri matches, completely, the initial root url path") { |s| options[:scope_uri] = true }
     opts.on("--local-cache", "Enable local caching of data (off by default)") {|l| options[:cache_enabled] = l }
     opts.on("-h N", "--threads N", Integer, "Set number of crawler threads to use") {|t| options[:threads] = t}
     opts.on("--yuk", "Horrible hack to fix poor CDATA termination, specific to a site - fix") {|y| options[:yuk] = y }
diff --git a/scripts/test/test_analyzer.rb b/scripts/test/test_analyzer.rb
index e931003..dc09b64 100644
--- a/scripts/test/test_analyzer.rb
+++ b/scripts/test/test_analyzer.rb
@@ -30,7 +30,7 @@ def test_analyze_only_text_html
       assert_equal [to_href("http://www.foo.com/hello.html")], content.readlines
     end
   end
-  
+
   def test_with_link
     p = { :log => @log, :user_agent => "007", :from => "mars" }
     add_expect_uri("http://www.foo.com/", to_href("http://www.foo.com/hello.html"))
@@ -40,7 +40,7 @@ def test_with_link
       assert_equal [to_href("http://www.foo.com/hello.html")], content.readlines
     end
   end
-  
+
   def test_no_cross_site
     @log.expects(:warn).once.with('Skipping as host differs http://www.bar.com/hello.html')
     p = { :log => @log, :user_agent => "007", :from => "mars" }
@@ -51,7 +51,7 @@ def test_no_cross_site
       assert_equal [to_href("http://www.bar.com/hello.html")], content.readlines
     end
   end
-  
+
   def test_follow_only_http_links
     @log.expects(:warn).once.with('Skipping as non-http file://hello.html')
     p = { :log => @log, :user_agent => "007", :from => "mars" }
@@ -63,6 +63,27 @@ def test_follow_only_http_links
     end
   end
 
+  def test_stay_in_root_uri_scope
+    @log.expects(:warn).once.with('Skipping as path outside root scope http://www.foo.com/a/c/')
+    root = "http://www.foo.com/a/b/index.htm"
+    uri = "http://www.foo.com/a/b/foo.txt"
+    p = { :url => URI.parse(root), :log => @log, :user_agent => "007", :from => "mars", :scope_uri => true }
+    add_expect_uri(uri, to_href("http://www.foo.com/a/c/"))
+    Analyzer.new().extract_links(URI.parse(uri), "peter pan", nil, p) do |crawled_title, new_links, content|
+      assert_equal nil, crawled_title
+      assert_equal([], new_links)
+    end
+
+    @log.expects(:warn).once.with('Skipping as path outside root scope http://www.foo.com/a/c/bar.txt')
+    uri = "http://www.foo.com/a/b/foo.txt"
+    p = { :url => URI.parse(root), :log => @log, :user_agent => "007", :from => "mars", :scope_uri => true }
+    add_expect_uri(uri, to_href("../c/bar.txt"))
+    Analyzer.new().extract_links(URI.parse(uri), "peter pan", nil, p) do |crawled_title, new_links, content|
+      assert_equal nil, crawled_title
+      assert_equal([], new_links)
+    end
+  end
+
   def test_bad_uri
     @log.expects(:warn).once.with('Invalid link in page http://www.foo.com/ : bad URI(is not URI?): :bar:foo')
     p = { :log => @log, :user_agent => "007", :from => "mars" }
@@ -73,7 +94,7 @@ def test_bad_uri
       assert_equal [to_href(":bar:foo")], content.readlines
     end
   end
-  
+
   def test_no_links
     p = { :log => @log, :user_agent => "007", :from => "mars" }
     add_expect_uri("http://www.foo.com/")
@@ -83,7 +104,7 @@ def test_no_links
       assert_equal ['Hello world'], content.readlines
     end
   end
-  
+
   def test_head_title
     body = 'Fish Head!'
     p = { :log => @log, :user_agent => "007", :from => "mars" }
@@ -91,7 +112,7 @@ def test_head_title
     Analyzer.new().extract_links(URI.parse("http://www.foo.com/"), "eel man", nil, p) do |crawled_title, new_links, content|
       assert_equal nil, crawled_title
     end
-  
+
     # Only store if the strip worked
     p.merge!({ :title_strip => 'Tree!'})
     add_expect_uri("http://www.foo.com/", body)
@@ -105,28 +126,28 @@
       assert_equal 'Fish', crawled_title
     end
   end
-  
+
   def test_ideas_title
     body = 'Womble'
-  
+
     p = { :log => @log, :user_agent => "007", :from => "mars" }
     add_expect_uri("http://www.foo.com/", body)
-  
+
     Analyzer.new().extract_links(URI.parse("http://www.foo.com/"), "eel man", nil, p) do |crawled_title, new_links, content|
       assert_equal 'Womble', crawled_title
     end
-  
+
     add_expect_uri("http://www.foo.com/", body + body)
     Analyzer.new().extract_links(URI.parse("http://www.foo.com/"), "fish man", nil, p) do |crawled_title, new_links, content|
       assert_equal nil, crawled_title
     end
   end
-  
+
   def test_blog_title
     body = 'Womble'
     p = { :log => @log, :user_agent => "007", :from => "mars" }
     add_expect_uri("http://www.foo.com/", body)
-  
+
     Analyzer.new().extract_links(URI.parse("http://www.foo.com/"), "squid man", nil, p) do |crawled_title, new_links, content|
       assert_equal 'Womble', crawled_title
     end
@@ -136,7 +157,7 @@
       assert_equal nil, crawled_title
     end
   end
-  
+
   def test_handles_not_modified_304
     @log.expects(:info).twice.with('Content hasn\'t changed since last crawl http://www.foo.com/bar.html')
     p = { :log => @log, :user_agent => "007", :from => "mars" }
@@ -152,12 +173,12 @@
 
   def test_last_crawled_at
     p = { :log => @log, :user_agent => "007", :from => "mars" }
-  
+
     last_crawled_at = Time.parse("2010-01-01")
     uri = URI.parse("http://www.foo.com/bar.html")
     uri.expects(:open).with({'From' => 'mars', 'User-Agent' => '007', 'If-Modified-Since' => last_crawled_at.to_s, 'Referer' => 'peter pan'})
     uri.expects(:open).with({'From' => 'mars', 'User-Agent' => '007', 'Referer' => 'peter pan'})
-  
+
     analyzer = Analyzer.new()
     analyzer.expects(:analyze).twice.with(uri, nil, p)
     analyzer.extract_links(uri, "peter pan", nil, p) {|content| assert_equal [nil, nil, nil], content }
diff --git a/scripts/test/test_cache.rb b/scripts/test/test_cache.rb
index feb27da..84612e6 100644
--- a/scripts/test/test_cache.rb
+++ b/scripts/test/test_cache.rb
@@ -43,12 +43,12 @@ def test_from_path
 
     at = Time.parse("2001-01-01")
     path = "~/home/foo/"
-    SQLite3::Database.expects(:new).with(File.join(path, 'www_rat_com')).returns("not a real db")
+    SQLite3::Database.expects(:new).with(File.join(path, 'www_rat_com__foo_html')).returns("not a real db")
     Cache.expects(:new).with(uri, at, "not a real db").returns(nil)
     Cache.from_path(uri, at, path)
 
     path = "~/home/bar"
-    SQLite3::Database.expects(:new).with(File.join(path, 'www_rat_com')).returns("not a real db")
+    SQLite3::Database.expects(:new).with(File.join(path, 'www_rat_com__foo_html')).returns("not a real db")
     Cache.expects(:new).with(uri, at, "not a real db").returns(nil)
     Cache.from_path(uri, at, path)
   end
diff --git a/scripts/test/test_options.rb b/scripts/test/test_options.rb
index ebff059..66b6bfa 100644
--- a/scripts/test/test_options.rb
+++ b/scripts/test/test_options.rb
@@ -50,6 +50,7 @@ def test_optional_options
     test_opt("-h", "--threads", "3", :threads, 3)
     test_opt("--yuk", "--yuk", true, :yuk)
     test_opt(nil, "--local-cache", true, :cache_enabled)
+    test_opt(nil, "--scope-to-root", true, :scope_uri)
   end
 
   def test_opt(short, long, value, option, converted_value = nil)