Permalink
Browse files

Wrote a benchmark to run against 180 sample feeds. Fixed a bunch of bugs in determining which parser to use based on those 180 feeds. Need to change the logic for what happens when it doesn't know how to parse something.
  • Loading branch information...
1 parent ed90bf6 commit 65b63d0f4ee632682ae400b6ccfb1e61d7d2dedd @pauldix pauldix committed Jan 31, 2009
View
@@ -11,7 +11,7 @@ class Atom
elements :entry, :as => :entries, :class => AtomEntry
def self.able_to_parse?(xml)
- xml =~ /Atom/
+ xml =~ /(Atom)|(#{Regexp.escape("http://purl.org/atom")})/
end
end
end
@@ -11,5 +11,6 @@ class AtomEntry
element :content
element :summary
element :published
+ element :created, :as => :published
end
end
View
@@ -62,12 +62,18 @@ def self.fetch_and_parse(urls, options = {})
curl.headers["If-None-Match"] = options[:if_none_match] if options.has_key?(:if_none_match)
curl.follow_location = true
curl.on_success do |c|
- feed = Feed.parse(c.body_str)
- feed.feed_url ||= c.last_effective_url
- feed.etag = etag_from_header(c.header_str)
- feed.last_modified = last_modified_from_header(c.header_str)
- responses[url] = feed
- options[:on_success].call(url, feed) if options.has_key?(:on_success)
+ xml = c.body_str
+ klass = determine_feed_parser_for_xml(xml)
+ if klass
+ feed = klass.parse(xml)
+ feed.feed_url ||= c.last_effective_url
+ feed.etag = etag_from_header(c.header_str)
+ feed.last_modified = last_modified_from_header(c.header_str)
+ responses[url] = feed
+ options[:on_success].call(url, feed) if options.has_key?(:on_success)
+ else
+ puts "Error determining parser for #{url} - #{c.last_effective_url}"
+ end
end
curl.on_failure do |c|
responses[url] = c.response_code
View
@@ -12,7 +12,7 @@ class RDF
attr_accessor :feed_url
def self.able_to_parse?(xml)
- xml =~ /rdf\:RDF/ || false
+ xml =~ /(rdf\:RDF)|(#{Regexp.escape("http://purl.org/rss/1.0")})|(rss version\=\"0\.9.?\")/ || false
end
end
end
View
@@ -12,7 +12,7 @@ class RSS
attr_accessor :feed_url
def self.able_to_parse?(xml)
- xml =~ /rss version\=\"2\.0\"/
+ xml =~ /rss.*version\=\"2\.0\"/
end
end
end
@@ -0,0 +1,37 @@
+require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
+require 'rfeedparser'
+require 'feed-normalizer'
+require 'open-uri'
+
+require 'benchmark'
+include Benchmark
+
+iterations = 10
+urls = File.readlines(File.dirname(__FILE__) + "/../sample_feeds/successful_feed_urls.txt")
+puts "benchmarks on #{urls.size} feeds"
+puts "************************************"
+benchmark do |t|
+ t.report("feedzirra") do
+ iterations.times do
+ Feedzirra::Feed.fetch_and_parse(urls, :on_success => lambda { |url, feed| $stdout.print '.'; $stdout.flush })
+ end
+ end
+
+ t.report("rfeedparser") do
+ iterations.times do
+ urls.each do |url|
+ feed = FeedParser.parse(url)
+ $stdout.print '.'
+ $stdout.flush
+ end
+ end
+ end
+
+ t.report("feed-normalizer") do
+ urls.each do |url|
+ feed = FeedNormalizer::FeedNormalizer.parse(open(url), :force_parser => FeedNormalizer::SimpleRssParser)
+ $stdout.print '.'
+ $stdout.flush
+ end
+ end
+end
@@ -0,0 +1,46 @@
+require File.dirname(__FILE__) + '/../../lib/feedzirra.rb'
+require 'rfeedparser'
+require 'feed-normalizer'
+
+require 'benchmark'
+include Benchmark
+
+iterations = 50
+xml = File.read(File.dirname(__FILE__) + '/../sample_feeds/PaulDixExplainsNothing.xml')
+
+benchmark do |t|
+ t.report("feedzirra") do
+ iterations.times do
+ f = Feedzirra::Feed.parse(xml)
+ title = f.title
+ first_title = f.entries.first.title
+ first_author = f.entries.first.author
+ first_url = f.entries.first.url
+ end
+ end
+
+ t.report("rfeedparser") do
+ iterations.times do
+ f = FeedParser.parse(xml)
+ title = f.title
+ first_title = f.entries.first.title
+ first_author = f.entries.first.author
+ first_url = f.entries.first.url
+ end
+ end
+
+ t.report("feed-normalizer") do
+ iterations.times do
+ # have to use the :force option to make feed-normalizer parse an atom feed
+ f = FeedNormalizer::FeedNormalizer.parse(xml, :force_parser => FeedNormalizer::SimpleRssParser)
+ # title = f.title
+ # first_title = f.entries.first.title
+ # first_author = f.entries.first.author
+ # first_url = f.entries.first.url
+ # puts title
+ # puts first_title
+ # puts first_author
+ # puts first_url
+ end
+ end
+end
@@ -29,6 +29,13 @@
feed.title.should == "Paul Dix Explains Nothing"
feed.entries.size.should == 5
end
+
+ it "should parse an feedburner rss feed" do
+ feed = Feedzirra::Feed.parse(sample_rss_feed_burner_feed)
+ feed.class.should == Feedzirra::RDF
+ feed.title.should == "Sam Harris: Author, Philosopher, Essayist, Atheist"
+ feed.entries.size.should == 10
+ end
end
describe "#determine_feed_parser_for_xml" do
@@ -44,6 +51,10 @@
Feedzirra::Feed.determine_feed_parser_for_xml(sample_rdf_feed).should == Feedzirra::RDF
end
+ it "should return the Feedzirra::RDF class for an rss feedburner feed" do
+ Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed_burner_feed).should == Feedzirra::RDF
+ end
+
it "should return the Feedzirra::RSS object for an rss 2.0 feed" do
Feedzirra::Feed.determine_feed_parser_for_xml(sample_rss_feed).should == Feedzirra::RSS
end
@@ -13,6 +13,7 @@
:url => opml_entry.attributes["htmlUrl"].to_s)
end
+urls = []
multi = Curl::Multi.new
feeds.each do |feed|
on_failure = lambda do |ex|
@@ -23,11 +24,11 @@
on_success = lambda do |body|
puts "got #{feed.title} - #{feed.feed_url}"
- File.open("#{feed.title.gsub(/\W/, "")}.xml", "w") do |f|
- f.write(body)
- end
+ urls << feed.feed_url
end
multi.get(feed.feed_url, on_success, on_failure)
end
-multi.select([], []) while multi.size > 0
+multi.select([], []) while multi.size > 0
+
+File.open("successful_feed_urls.txt", "w") {|f| f.write(urls.join("\n"))}
@@ -0,0 +1,180 @@
+http://feeds.feedburner.com/CoryForsyth
+http://feeds.feedburner.com/dcmanges
+http://www.allthingsdistributed.com/index.xml
+http://feeds.feedburner.com/pmarca
+http://aws.typepad.com/aws/atom.xml
+http://blog.caboo.se/feed/atom.xml
+http://www.avibryant.com/index.rdf
+http://joyeur.com/atom/
+http://hunch.net/?feed=rss2
+http://feeds.feedburner.com/Nanorails
+http://feeds.feedburner.com/PaulDixExplainsNothing
+http://feeds.feedburner.com/SamHarris
+http://rubyforge.org/export/rss_sfnewreleases.php
+http://mike.bailey.net.au/blog/?feed=rss2
+http://rubyforge.org/export/rss_sfnews.php
+http://feeds.feedburner.com/TomWhite
+http://pauldowman.com/feed/
+http://railsontherun.com/feed/atom.xml
+http://feeds.feedburner.com/devver/blog
+http://blogsearch.google.com/blogsearch_feeds?hl=en&amp;scoring=d&amp;q=%22paul+dix%22&amp;ie=utf-8&amp;output=atom
+http://feeds.feedburner.com/HacketyOrg
+http://jobs.joelonsoftware.com/default.asp?pg=pgFeed&amp;feed=9095128
+http://gslounge.com/blog/feed
+http://www.oreillynet.com/pub/feed/89
+http://feeds.feedburner.com/activereload
+http://jobs.37signals.com/jobs.rss
+http://feeds.feedburner.com/al3x
+http://www.mysqlperformanceblog.com/feed/
+http://soylentfoo.jnewland.com/xml/rss20/feed.xml
+http://feeds.feedburner.com/blogspot/aefO
+http://tweetscan.com/rss.php?s=pauldix
+http://adam.blogs.bitscribe.net/feed/
+http://feeds.feedburner.com/newbamboo
+http://www.postal-code.com/binarycode/feed/
+http://brainspl.at/xml/rss20/feed.xml
+http://feeds.feedburner.com/brynary
+http://feeds.feedburner.com/CoryFoy
+http://feeds.feedburner.com/Chadfowlercom
+http://feeds.feedburner.com/rubypal/KoEa
+http://irthoughts.wordpress.com/feed/
+http://cfis.savagexi.com/articles.atom
+http://www.danwebb.net/feed/atom.xml
+http://feeds.feedburner.com/encytemedia
+http://feeds.feedburner.com/errtheblog
+http://www.eribium.org/blog/?feed=rss2
+http://feeds.feedburner.com/FingerprintsOfCasperFabricius
+http://blog.rapleaf.com/dev/?feed=rss2
+http://codemode.blogspot.com/feeds/posts/default
+http://feeds.feedburner.com/GiantRobotsSmashingIntoOtherGiantRobots
+http://feeds.feedburner.com/hasmanythrough
+http://blog.imperialdune.com/feed/atom.xml
+http://blog.craigambrose.com/xml/rss20/feed.xml
+http://www.infoq.com/rss/rss.action?token=M7lRPBznVOdzQgBkfJsR2LMOUm72X9hp
+http://feeds.feedburner.com/JamesBritt-Home
+http://fhwang.net/syndicate/ruby.atom
+http://www.urbanhonking.com/ideasfordozens/atom.xml
+http://www.jrmiii.com/feed/atom.xml
+http://www.jonsthoughtsoneverything.com/feed/
+http://feeds.feedburner.com/mongoo/CTIN
+http://everburning.com/feed/
+http://feeds.feedburner.com/LoudThinking
+http://rubylearning.com/blog/feed/
+http://feeds.feedburner.com/MartinFowlersBliki
+http://jicksta.com/feed
+http://www.bofh.org.uk/articles.atom
+http://mike.daless.io/aintablog/articles.rss
+http://feeds.feedburner.com/NickSieger
+http://dev.massivebraingames.com/rss
+http://mikepence.wordpress.com/feed/
+http://rubyphilia.wordpress.com/feed/
+http://www.nimblecode.com/xml/rss/feed.xml
+http://feeds.feedburner.com/pluron
+http://lifecoding.com/blog/?feed=rss2
+http://sam.aaron.name/feed/atom.xml
+http://feeds.feedburner.com/objo
+http://feeds.feedburner.com/ozmmorg
+http://www.notsostupid.com/feed/
+http://on-ruby.blogspot.com/atom.xml
+http://oneless.blogspot.com/feeds/posts/default
+http://onrails.org/xml/rss20/feed.xml
+http://blog.pastie.org/index.rdf
+http://ola-bini.blogspot.com/atom.xml
+http://tomcopeland.blogs.com/juniordeveloper/atom.xml
+http://blog.fallingsnow.net/feed/
+http://pitsula.blogspot.com/feeds/posts/default
+http://lylejohnson.name/blog/feed/
+http://feeds.feedburner.com/prototype-blog
+http://planetruby.0x42.net/rss20.xml
+http://blog.zenspider.com/atom.xml
+http://pragdave.pragprog.com/pragdave/atom.xml
+http://feeds.feedburner.com/rails-envy
+http://feeds.feedburner.com/railsjitsu
+http://feeds.feedburner.com/riab
+http://feeds.feedburner.com/RobertREvans
+http://feeds.feedburner.com/reinh
+http://blog.methodmissing.com/feed/atom.xml
+http://redhanded.hobix.com/index.xml
+http://feeds.feedburner.com/RubyOnRailsSecurity
+http://feeds.feedburner.com/RidingRails
+http://feeds.feedburner.com/ruby_is_awesome
+http://feeds.feedburner.com/rubypond/JXRc
+http://www.rubycorner.com/feeds/updated/rss20
+http://rubyonwindows.blogspot.com/feeds/posts/default
+http://rubyquiz.com/index.rss
+http://ruby.tie-rack.org/feed/
+http://www.rubyhead.com/feed/
+http://feeds.feedburner.com/37signals/beMH
+http://feeds.feedburner.com/SimplisticComplexity
+http://feeds.feedburner.com/slash7/rss
+http://smartic.us/feed/atom.xml
+http://cuttingtheredtape.blogspot.com/feeds/posts/default
+http://feeds.feedburner.com/sneer/blog
+http://www.spacebabies.nl/feed/
+http://feeds.feedburner.com/cleanair
+http://richkilmer.blogs.com/ether/atom.xml
+http://tenderlovemaking.com/feed/
+http://feeds.feedburner.com/terralien-ships-log
+http://feeds.feedburner.com/ZenAndTheArtOfRubyProgramming
+http://feeds.feedburner.com/rufytech
+http://feeds.feedburner.com/WorkingWithRails
+http://www.yup.com/xml/atom10/feed.xml
+http://feeds.feedburner.com/StakeVentures
+http://metaclass.org/feed/atom.xml
+http://feeds.feedburner.com/therailsist
+http://www.onestepback.org/index.cgi/index.rss
+http://merbist.com/feed/
+http://feeds.feedburner.com/kevingc
+http://feeds.feedburner.com/nuttnet/qWLn
+http://brontemedia.com/feed/
+http://feeds.feedburner.com/brynary
+http://feeds.feedburner.com/AmitGuptasWeblog
+http://brighter.net/rss
+http://feeds.feedburner.com/gilesbowkett
+http://feeds.feedburner.com/hasmanythrough
+http://probablycorey.wordpress.com/feed/
+http://feeds.feedburner.com/innonate
+http://feeds.feedburner.com/Kungpowthinking
+http://nikocunningham.blogspot.com/feeds/posts/default
+http://www.notsostupid.com/feed/
+http://feeds.feedburner.com/slash7/rss
+http://feeds.feedburner.com/ThirdRail
+http://feeds.feedburner.com/trottercashion
+http://weblogs.java.net/blog/arungupta/index.rdf
+http://fabiokung.com/feed/
+http://feeds.feedburner.com/nicksieger
+http://metaclass.org/feed/atom.xml
+http://fhwang.net/syndicate/ruby.atom
+http://ola-bini.blogspot.com/atom.xml
+http://irthoughts.wordpress.com/feed/
+http://www.zedshaw.com/feed.atom
+http://feeds.feedburner.com/devthatweb
+http://ross.typepad.com/blog/atom.xml
+http://www.pbs.org/cringely/pulpit/rss2.xml
+http://rss.slashdot.org/slashdot/eqWf
+http://www.zedshaw.com/feed.atom
+http://www.scripting.com/rss.xml
+http://www.techmeme.com/index.xml
+http://feeds.feedburner.com/LinkBuildingBlog
+http://feeds.feedburner.com/AmitGuptasWeblog
+http://codesnipers.com/?q=node/feed
+http://feeds.feedburner.com/hermanshead76
+http://www.joelonsoftware.com/rss.xml
+http://feeds.feedburner.com/InformationArbitrage
+http://paulgraham.infogami.com/blog/atom.xml
+http://feeds.feedburner.com/startupping
+http://feeds.feedburner.com/NewYorkSmallBusinessLaw
+http://www.userscape.com/blog/index.php/site/rss_2.0//rss2/
+http://www.alistapart.com/feed/rss.xml
+http://lsvp.wordpress.com/feed/
+http://www.microisv.com/feed/
+http://feeds.b5media.com/b5media/StartupSpark
+http://pragmatictheory.blogspot.com/feeds/posts/default
+http://fiveyearstoolate.wordpress.com/feed/
+http://feeds.feedburner.com/blogniscient
+http://feeds.feedburner.com/Feedblog
+http://feeds.feedburner.com/Tailrank
+http://www.regator.com/blog/?feed=rss2
+http://feeds.feedburner.com/Spinn3r
+http://blog.spotback.com/feed/
+http://irthoughts.wordpress.com/feed/
View
@@ -25,6 +25,10 @@ def sample_rdf_entry_content
File.read("#{File.dirname(__FILE__)}/sample_feeds/HREFConsideredHarmfulFirstEntry.xml")
end
+def sample_rss_feed_burner_feed
+ File.read("#{File.dirname(__FILE__)}/sample_feeds/SamHarrisAuthorPhilosopherEssayistAtheist.xml")
+end
+
def sample_rss_feed
File.read("#{File.dirname(__FILE__)}/sample_feeds/TenderLovemaking.xml")
end

0 comments on commit 65b63d0

Please sign in to comment.