Browse files

allow user-specified user agent and page request delay

  • Loading branch information...
1 parent ba35e18 commit 2acde9912dc7b7f2d32c2ad665d703e05cf599dc @chriskite chriskite committed Aug 10, 2009
Showing with 45 additions and 10 deletions.
  1. +13 −5 lib/anemone/anemone.rb
  2. +1 −1 lib/anemone/http.rb
  3. +2 −0 lib/anemone/tentacle.rb
  4. +12 −3 spec/anemone_spec.rb
  5. +14 −0 spec/core_spec.rb
  6. +3 −1 spec/spec_helper.rb
View
18 lib/anemone/anemone.rb
@@ -4,10 +4,7 @@
module Anemone
# Version number
VERSION = '0.1.1'
-
- # User-Agent string used for HTTP requests
- USER_AGENT = "Anemone/#{self::VERSION}"
-
+
#module-wide options
def Anemone.options=(options)
@options = options
@@ -31,7 +28,18 @@ def Anemone.crawl(urls, options = {}, &block)
#by default, don't throw away the page response body after scanning it for links
Anemone.options.discard_page_bodies ||= false
-
+
+ #by default, identify self as Anemone/VERSION
+ Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"
+
+ #no delay between requests by default
+ Anemone.options.delay ||= 0
+
+ #use a single thread if a delay was requested
+ if(Anemone.options.delay != 0)
+ Anemone.options.threads = 1
+ end
+
Core.crawl(urls, &block)
end
end
View
2 lib/anemone/http.rb
@@ -31,7 +31,7 @@ def self.get(url)
def self.get_response(url)
full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
Net::HTTP.start(url.host, url.port) do |http|
- return http.get(full_path, {'User-Agent' => Anemone::USER_AGENT })
+ return http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
end
end
end
View
2 lib/anemone/tentacle.rb
@@ -24,6 +24,8 @@ def run
page = Page.fetch(link)
@page_queue.enq(page)
+
+ sleep Anemone.options.delay
end
end
View
15 spec/anemone_spec.rb
@@ -2,20 +2,29 @@
describe Anemone do
- it "should have a version and user agent" do
+ it "should have a version" do
Anemone.const_defined?('VERSION').should == true
- Anemone.const_defined?('USER_AGENT').should == true
end
it "should have options" do
Anemone.should respond_to(:options)
end
it "should accept options for the crawl" do
- Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true)
+ Anemone.crawl(SPEC_DOMAIN, :verbose => false,
+ :threads => 2,
+ :discard_page_bodies => true,
+ :user_agent => 'test')
Anemone.options.verbose.should == false
Anemone.options.threads.should == 2
Anemone.options.discard_page_bodies.should == true
+ Anemone.options.delay.should == 0
+ Anemone.options.user_agent.should == 'test'
+ end
+
+ it "should use 1 thread if a delay is requested" do
+ Anemone.crawl(SPEC_DOMAIN, :delay => 0.01, :threads => 2)
+ Anemone.options.threads.should == 1
end
it "should return a Anemone::Core from the crawl, which has a PageHash" do
View
14 spec/core_spec.rb
@@ -110,5 +110,19 @@ module Anemone
core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
end
+ it "should optionally delay between page requests" do
+ delay = 0.25
+
+ pages = []
+ pages << FakePage.new('0', :links => '1')
+ pages << FakePage.new('1')
+
+ start = Time.now
+ Anemone.crawl(pages[0].url, :delay => delay)
+ finish = Time.now
+
+ (finish - start).should satisfy {|t| t > delay * 2}
+ end
+
end
end
View
4 spec/spec_helper.rb
@@ -1,5 +1,7 @@
-require File.dirname(__FILE__) + '/../lib/anemone'
require File.dirname(__FILE__) + '/fakeweb_helper'
require 'rubygems'
+$:.unshift(File.dirname(__FILE__) + '/../lib/')
+require 'anemone'
+
SPEC_DOMAIN = 'http://www.example.com/'

0 comments on commit 2acde99

Please sign in to comment.