From 2acde9912dc7b7f2d32c2ad665d703e05cf599dc Mon Sep 17 00:00:00 2001 From: Chris Kite Date: Mon, 10 Aug 2009 20:47:25 -0500 Subject: [PATCH] allow user-specified user agent and page request delay --- lib/anemone/anemone.rb | 18 +++++++++++++----- lib/anemone/http.rb | 2 +- lib/anemone/tentacle.rb | 2 ++ spec/anemone_spec.rb | 15 ++++++++++++--- spec/core_spec.rb | 14 ++++++++++++++ spec/spec_helper.rb | 4 +++- 6 files changed, 45 insertions(+), 10 deletions(-) diff --git a/lib/anemone/anemone.rb b/lib/anemone/anemone.rb index b3b784d0..c8e7d429 100644 --- a/lib/anemone/anemone.rb +++ b/lib/anemone/anemone.rb @@ -4,10 +4,7 @@ module Anemone # Version number VERSION = '0.1.1' - - # User-Agent string used for HTTP requests - USER_AGENT = "Anemone/#{self::VERSION}" - + #module-wide options def Anemone.options=(options) @options = options @@ -31,7 +28,18 @@ def Anemone.crawl(urls, options = {}, &block) #by default, don't throw away the page response body after scanning it for links Anemone.options.discard_page_bodies ||= false - + + #by default, identify self as Anemone/VERSION + Anemone.options.user_agent ||= "Anemone/#{self::VERSION}" + + #no delay between requests by default + Anemone.options.delay ||= 0 + + #use a single thread if a delay was requested + if(Anemone.options.delay != 0) + Anemone.options.threads = 1 + end + Core.crawl(urls, &block) end end diff --git a/lib/anemone/http.rb b/lib/anemone/http.rb index 4eb1f350..bc6ab30f 100644 --- a/lib/anemone/http.rb +++ b/lib/anemone/http.rb @@ -31,7 +31,7 @@ def self.get(url) def self.get_response(url) full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}" Net::HTTP.start(url.host, url.port) do |http| - return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT }) + return http.get(full_path, {'User-Agent' => Anemone.options.user_agent }) end end end diff --git a/lib/anemone/tentacle.rb b/lib/anemone/tentacle.rb index c58096e3..4b7372fa 100644 --- a/lib/anemone/tentacle.rb +++ b/lib/anemone/tentacle.rb @@ -24,6 +24,8 @@ def run page = Page.fetch(link) @page_queue.enq(page) + + sleep Anemone.options.delay end end diff --git a/spec/anemone_spec.rb b/spec/anemone_spec.rb index ad6e0def..f1d8949f 100644 --- a/spec/anemone_spec.rb +++ b/spec/anemone_spec.rb @@ -2,9 +2,8 @@ describe Anemone do - it "should have a version and user agent" do + it "should have a version" do Anemone.const_defined?('VERSION').should == true - Anemone.const_defined?('USER_AGENT').should == true end it "should have options" do @@ -12,10 +11,20 @@ end it "should accept options for the crawl" do - Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true) + Anemone.crawl(SPEC_DOMAIN, :verbose => false, + :threads => 2, + :discard_page_bodies => true, + :user_agent => 'test') Anemone.options.verbose.should == false Anemone.options.threads.should == 2 Anemone.options.discard_page_bodies.should == true + Anemone.options.delay.should == 0 + Anemone.options.user_agent.should == 'test' + end + + it "should use 1 thread if a delay is requested" do + Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2) + Anemone.options.threads.should == 1 end it "should return a Anemone::Core from the crawl, which has a PageHash" do diff --git a/spec/core_spec.rb b/spec/core_spec.rb index 5f29ed77..73bc70d0 100644 --- a/spec/core_spec.rb +++ b/spec/core_spec.rb @@ -110,5 +110,19 @@ module Anemone core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url) end + it "should optionally delay between page requests" do + delay = 0.25 + + pages = [] + pages << FakePage.new('0', :links => '1') + pages << FakePage.new('1') + + start = Time.now + Anemone.crawl(pages[0].url, :delay => delay) + finish = Time.now + + (finish - start).should satisfy {|t| t > delay * 2} + end + end end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 070a7ad2..a5bd0233 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,5 +1,7 @@ -require File.dirname(__FILE__) + '/../lib/anemone' require File.dirname(__FILE__) + '/fakeweb_helper' require 'rubygems' +$:.unshift(File.dirname(__FILE__) + '/../lib/') +require 'anemone' + SPEC_DOMAIN = 'http://www.example.com/'