Skip to content

Commit

Permalink
allow user-specified user agent and page request delay
Browse files Browse the repository at this point in the history
  • Loading branch information
chriskite committed Aug 11, 2009
1 parent ba35e18 commit 2acde99
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 10 deletions.
18 changes: 13 additions & 5 deletions lib/anemone/anemone.rb
Expand Up @@ -4,10 +4,7 @@
module Anemone
# Version number
VERSION = '0.1.1'

# User-Agent string used for HTTP requests
USER_AGENT = "Anemone/#{self::VERSION}"


#module-wide options
def Anemone.options=(options)
@options = options
Expand All @@ -31,7 +28,18 @@ def Anemone.crawl(urls, options = {}, &block)

#by default, don't throw away the page response body after scanning it for links
Anemone.options.discard_page_bodies ||= false


#by default, identify self as Anemone/VERSION
Anemone.options.user_agent ||= "Anemone/#{self::VERSION}"

#no delay between requests by default
Anemone.options.delay ||= 0

#use a single thread if a delay was requested
if(Anemone.options.delay != 0)
Anemone.options.threads = 1
end

Core.crawl(urls, &block)
end
end
2 changes: 1 addition & 1 deletion lib/anemone/http.rb
Expand Up @@ -31,7 +31,7 @@ def self.get(url)
#
# Issue an HTTP GET for *url* and hand back the response.
#
# url - object whose host, port, path and query are read to build the request
#
# The request identifies itself with the user agent configured in
# Anemone.options.user_agent. Returns whatever Net::HTTP#get returns.
#
def self.get_response(url)
  #only the path is sent to the server, so re-attach the query string if there is one
  full_path = url.query.nil? ? url.path : "#{url.path}?#{url.query}"
  Net::HTTP.start(url.host, url.port) do |http|
    #Net::HTTP.start returns the value of its block, so this is the method's result
    http.get(full_path, {'User-Agent' => Anemone.options.user_agent })
  end
end
end
Expand Down
2 changes: 2 additions & 0 deletions lib/anemone/tentacle.rb
Expand Up @@ -24,6 +24,8 @@ def run
page = Page.fetch(link)

@page_queue.enq(page)

sleep Anemone.options.delay
end
end

Expand Down
15 changes: 12 additions & 3 deletions spec/anemone_spec.rb
Expand Up @@ -2,20 +2,29 @@

describe Anemone do

# NOTE(review): two `it` headers appear back to back and only one `end` closes them --
# presumably the old and new titles of the same example as shown by the diff view; confirm against the real file
it "should have a version and user agent" do
it "should have a version" do
# the module must define a VERSION constant
Anemone.const_defined?('VERSION').should == true
# NOTE(review): presumably the pre-change expectation for the USER_AGENT constant -- confirm
Anemone.const_defined?('USER_AGENT').should == true
end

# the module itself must answer to an `options` reader
it "should have options" do
  Anemone.respond_to?(:options).should == true
end

# options handed to Anemone.crawl should be readable back through Anemone.options
it "should accept options for the crawl" do
# NOTE(review): crawl is called twice here -- presumably the old and new form of the same
# call as shown by the diff view; confirm against the real file
Anemone.crawl(SPEC_DOMAIN, :verbose => false, :threads => 2, :discard_page_bodies => true)
Anemone.crawl(SPEC_DOMAIN, :verbose => false,
:threads => 2,
:discard_page_bodies => true,
:user_agent => 'test')
Anemone.options.verbose.should == false
Anemone.options.threads.should == 2
Anemone.options.discard_page_bodies.should == true
# no :delay was passed above, so the expectation is a value of 0
Anemone.options.delay.should == 0
Anemone.options.user_agent.should == 'test'
end

# asking for a delay must win over the thread count the caller asked for
it "should use 1 thread if a delay is requested" do
  crawl_options = { :delay => 0.01, :threads => 2 }
  Anemone.crawl(SPEC_DOMAIN, crawl_options)

  Anemone.options.threads.should == 1
end

it "should return a Anemone::Core from the crawl, which has a PageHash" do
Expand Down
14 changes: 14 additions & 0 deletions spec/core_spec.rb
Expand Up @@ -110,5 +110,19 @@ module Anemone
core.pages.keys.map{|k| k.to_s}.should_not include(pages[1].url)
end

# crawling two linked fake pages with a delay should take longer than two delays
it "should optionally delay between page requests" do
  delay = 0.25

  #page '0' links to page '1', so the crawl requests both
  pages = [FakePage.new('0', :links => '1'), FakePage.new('1')]

  started_at = Time.now
  Anemone.crawl(pages.first.url, :delay => delay)
  elapsed = Time.now - started_at

  elapsed.should satisfy {|t| t > delay * 2}
end

end
end
4 changes: 3 additions & 1 deletion spec/spec_helper.rb
@@ -1,5 +1,7 @@
require File.dirname(__FILE__) + '/../lib/anemone'
require File.dirname(__FILE__) + '/fakeweb_helper'
require 'rubygems'

$:.unshift(File.dirname(__FILE__) + '/../lib/')
require 'anemone'

#root URL that the specs pass to Anemone.crawl
SPEC_DOMAIN = 'http://www.example.com/'

0 comments on commit 2acde99

Please sign in to comment.