Skip to content

Commit

Permalink
Merge pull request #35 from DanElbert/master
Browse files: browse the repository at this point in the history
Add ignore_fragments and fix query string handling
  • Loading branch information
oscardelben committed May 14, 2014
2 parents dab9e52 + 33e819d commit 943b283
Show file tree
Hide file tree
Showing 7 changed files with 35 additions and 8 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Rawler will only parse pages with content type 'text/html', but it will check fo
--include <s>: Only include URLs that match a regexp
--iinclude <s>: Only include URLs that match a case insensitive regexp
--local <s>: Restrict to the given URL and below. Equivalent to '--include ^http://mysite.com/*'.
--ignore_fragments: Strips any fragment from parsed links
--version, -v: Print version and exit
--help, -h: Show this message

Expand Down
4 changes: 2 additions & 2 deletions bin/rawler
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Usage:
where [options] are:
EOS

opt :username, "HTTP Basic Username", :type => :string
opt :password, "HTTP Basic Password", :type => :string
opt :wait, "Seconds to wait between requests, may be fractional e.g. '1.5'", :type => :float, :default => 3.0
Expand All @@ -25,6 +25,7 @@ EOS
opt :include, "Only include URLS that match a pattern", :type => :string
opt :iinclude, "Only include URLS that match a case insensitive pattern. Equivalent to '--include ^http://mysite.com/*'.", :type => :string
opt :local, "Restrict to the given URL and below", :type => :boolean, :default => false
opt :ignore_fragments, "Discard fragments when parsing links", :type => :boolean, :default => false
end


Expand All @@ -39,4 +40,3 @@ else
end

Rawler::Base.new(domain, $stdout, opts).validate

1 change: 1 addition & 0 deletions lib/rawler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ module Rawler
mattr_accessor :css
mattr_accessor :include_url_pattern
mattr_accessor :skip_url_pattern
mattr_accessor :ignore_fragments

autoload :Base, File.join(dir, 'rawler', 'base')
autoload :Crawler, File.join(dir, 'rawler', 'crawler')
Expand Down
7 changes: 4 additions & 3 deletions lib/rawler/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def initialize(url, output, options={})
Rawler.password = options[:password]
Rawler.wait = options[:wait]
Rawler.css = options[:css]

Rawler.ignore_fragments = options[:ignore_fragments]

Rawler.local = options[:local]

Rawler.set_include_pattern(options[:include], false) unless options[:include].nil?
Expand Down Expand Up @@ -54,14 +55,14 @@ def validate_css_links_in_page(page)

def validate_page(page_url, from_url)
if not_yet_parsed?(page_url)
add_status_code(page_url, from_url)
add_status_code(page_url, from_url)
validate_links_in_page(page_url) if same_domain?(page_url)
validate_css_links_in_page(page_url) if same_domain?(page_url) and Rawler.css
end
end

def validate_non_html(page_url, from_url)
if not_yet_parsed?(page_url)
if not_yet_parsed?(page_url)
add_status_code(page_url, from_url)
end
end
Expand Down
12 changes: 9 additions & 3 deletions lib/rawler/crawler.rb
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,16 @@ def get_links(selector)
def absolute_url(path)
path = URI.encode(path.strip, Regexp.new("[^#{URI::PATTERN::UNRESERVED}#{URI::PATTERN::RESERVED}#]"))

if URI.parse(path).scheme
path
uri = URI.parse(path)

if uri.fragment && Rawler.ignore_fragments
uri.fragment = nil
end

if uri.scheme
uri.to_s
else
URI.parse(url).merge(path).to_s
URI.parse(url).merge(uri).to_s
end
rescue URI::InvalidURIError, URI::InvalidComponentError
write("Invalid url: #{path} - Called from: #{url}")
Expand Down
2 changes: 2 additions & 0 deletions lib/rawler/request.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def perform_request(method, url)

path = (uri.path.size == 0) ? "/" : uri.path

path += "?#{uri.query}" if uri.query

request = Net::HTTP::Get.new(path)
request.basic_auth(Rawler.username, Rawler.password)
http.request(request)
Expand Down
16 changes: 16 additions & 0 deletions spec/lib/rawler/crawler_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,22 @@
crawler.links.should == ['http://example.com/foo#bar']
end

context "when ignore_fragments is on" do

before(:each) do
Rawler.ignore_fragments = true
end

after(:each) do
Rawler.ignore_fragments = false
end

it 'should strip the fragments' do
crawler.links.should == ['http://example.com/foo']
end

end

end

context "urls with unicode characters" do
Expand Down

0 comments on commit 943b283

Please sign in to comment.