Skip to content
Browse files

reorder public methods and refactor a little

  • Loading branch information...
1 parent f98f017 commit cf82a61be101ee00237aa03f7583fb2f557df3a1 @mislav committed Aug 29, 2010
Showing with 29 additions and 36 deletions.
  1. +3 −3 Rakefile
  2. +26 −33 lib/nibbler.rb
View
6 Rakefile
@@ -1,4 +1,4 @@
-task :default => :spec
+task :default => [:loc, :spec]
desc %(Run specs)
task :spec do
@@ -13,12 +13,12 @@ task :loc do
file.each_line do |line|
case line
when /^class\b/ then counting = true
- when /^\s*(#|\Z)/ then next
+ when /^\s*(#|$)/ then next
when /^end\b/ then break
end
loc += 1 if counting
end
- puts loc
+ puts "#{loc} lines of code"
end
end
View
59 lib/nibbler.rb
@@ -1,54 +1,51 @@
-## A minimalistic, declarative HTML scraper
-
+# A minimalistic, declarative HTML scraper
class Nibbler
attr_reader :doc
- # Accepts string, open file, or Nokogiri-like document
- def initialize(doc)
- @doc = self.class.convert_document(doc)
- initialize_plural_accessors
- end
-
- # Initialize a new scraper and process data
- def self.parse(html)
- new(html).parse
- end
-
- # Specify a new singular scraping rule
+ # Declare a singular scraping rule
def self.element(*args, &block)
selector, name, delegate = parse_rule_declaration(*args, &block)
rules[name] = [selector, delegate]
attr_accessor name
name
end
- # Specify a new plural scraping rule
+ # Declare a plural scraping rule
def self.elements(*args, &block)
name = element(*args, &block)
rules[name] << true
end
- # Let it do its thing!
+ # Process data by creating a new scraper
+ def self.parse(data) new(data).parse end
+
+ # Initialize the parser with raw data or a document
+ def initialize(data)
+ @doc = self.class.convert_document(data)
+ # initialize plural properties
+ self.class.rules.each { |name, (s, k, plural)| send("#{name}=", []) if plural }
+ end
+
+ # Parse the document and save values returned by selectors
def parse
self.class.rules.each do |target, (selector, delegate, plural)|
if plural
- @doc.search(selector).each do |node|
- send(target) << parse_result(node, delegate)
- end
+ send(target).concat @doc.search(selector).map { |i| parse_result(i, delegate) }
else
send("#{target}=", parse_result(@doc.at(selector), delegate))
end
end
self
end
+ # Dump the extracted data into a hash with symbolized keys
def to_hash
converter = lambda { |obj| obj.respond_to?(:to_hash) ? obj.to_hash : obj }
- self.class.rules.keys.inject({}) { |hash, name|
+ self.class.rules.keys.inject({}) do |hash, name|
value = send(name)
hash[name.to_sym] = Array === value ? value.map(&converter) : converter[value]
hash
- }
+ end
end
protected
@@ -66,18 +63,20 @@ def parse_result(node, delegate)
private
+ # Parsing rules declared with `element` or `elements`
def self.rules
@rules ||= {}
end
+ # Make subclasses inherit the parsing rules
def self.inherited(subclass)
subclass.rules.update self.rules
end
- # Rule declaration is in Hash or single argument form:
+ # Rule declaration forms:
#
- # { '//some/selector' => :name, :with => delegate }
- # #=> ['//some/selector', :name, delegate]
+ # { 'selector' => :property, :with => delegate }
+ # #=> ['selector', :property, delegate]
#
# :title
# #=> ['title', :title, nil]
@@ -91,18 +90,12 @@ def self.parse_rule_declaration(*args, &block)
return selector, property, delegate
end
- def initialize_plural_accessors
- self.class.rules.each do |name, (s, k, plural)|
- send("#{name}=", []) if plural
- end
- end
-
+ # Parse data with Nokogiri unless it's already an acceptable document
def self.convert_document(doc)
- unless doc.respond_to?(:at) && doc.respond_to?(:search)
+ if doc.respond_to?(:at) and doc.respond_to?(:search) then doc
+ else
require 'nokogiri' unless defined? ::Nokogiri
Nokogiri doc
- else
- doc
end
end
end

0 comments on commit cf82a61

Please sign in to comment.
Something went wrong with that request. Please try again.