Permalink
Browse files

rewrite batching and generation of sitemaps

  • Loading branch information...
1 parent 3350edb commit 44dcae4320969dc4ded3924eb13256d0cce8c26d @mislav committed Apr 3, 2009
Showing with 213 additions and 152 deletions.
  1. +159 −152 lib/big_sitemap.rb
  2. +54 −0 spec/big_sitemap_spec.rb
View
@@ -1,210 +1,217 @@
-require 'net/http'
require 'uri'
require 'zlib'
require 'builder'
-require 'extlib'
require 'fileutils'
class BigSitemap
DEFAULTS = {
:max_per_sitemap => 50000,
:batch_size => 1001,
- :path => 'sitemaps',
+ :gzip => true, # set false to inspect results
# opinionated
:ping_google => true,
:ping_yahoo => false, # needs :yahoo_app_id
- :ping_msn => false,
- :ping_ask => false,
+ :ping_msn => false
}
- include FileUtils
+ TIMESTAMP_COLUMNS = %w(updated_at updated_on updated created_at created_on created)
- def initialize(options)
- @options = DEFAULTS.merge options
-
- unless @options[:base_url]
- raise ArgumentError, 'Base URL must be specified with the ":base_url" option'
+ include ActionController::UrlWriter
+
+ class Builder < Builder::XmlMarkup
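+ # add support for writing a tag's opening and closing halves as separate calls:
+ # xml.open_foo!(attrs) # emits the start tag <foo ...> and increases the indent level
+ # xml.close_foo! # decreases the indent level and emits the end tag </foo>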
+ def method_missing(method, *args, &block)
+ if method.to_s =~ /^(open|close)_(.+)!$/
+ operation, name = $1, $2
+ name = "#{name}:#{args.shift}" if Symbol === args.first
+
+ if 'open' == operation
+ _indent
+ _start_tag(name, args.first)
+ _newline
+ @level += 1
+ else
+ @level -= 1
+ _indent
+ _end_tag(name)
+ _newline
+ end
+ else
+ super
+ end
end
+ end
+
+ def initialize(options = {})
+ @options = DEFAULTS.merge options
if @options[:batch_size] > @options[:max_per_sitemap]
raise ArgumentError, '":batch_size" must be less than ":max_per_sitemap"'
end
- @options[:document_root] ||= begin
- if defined? Rails
- "#{Rails.root}/public"
- elsif defined? Merb
- "#{Merb.root}/public"
- end
+ if @options[:url_options]
+ default_url_options.update @options[:url_options]
+ elsif @options[:base_url]
+ uri = URI.parse(@options[:base_url])
+ default_url_options[:host] = uri.host
+ default_url_options[:port] = uri.port
+ default_url_options[:protocol] = uri.scheme
+ else
+ raise ArgumentError, 'you must specify either ":url_options" hash or ":base_url" string'
end
- unless @options[:document_root]
- raise ArgumentError, 'Document root must be specified with the ":document_root" option'
- end
-
- @file_path = "#{@options[:document_root]}/#{@options[:path]}"
+ @root = @options[:document_root] || Rails.public_path
@sources = []
+ @sitemap_files = []
+
+ # The W3C datetime format is a subset of ISO 8601
+ Time::DATE_FORMATS[:sitemap] = lambda { |time|
+ time.strftime "%Y-%m-%dT%H:%M:%S#{time.formatted_offset(true, 'Z')}"
+ }
end
- def add(options)
- unless options[:model] and options[:path]
- raise ArgumentError, 'please provide ":model" and ":path"'
- end
-
- @sources << options.dup
- return self
+ def add(model, options = {})
+ @sources << [model, options.dup]
end
def clean
- rm_r @file_path
- return self
+ Dir["#{@root}/sitemap_*.{xml,xml.gz}"].each do |file|
+ FileUtils.rm file, :verbose => true
+ end
end
def generate
- @sources.each do |source|
- klass = source[:model]
-
- count_method = pick_method(klass, [:count_for_sitemap, :count])
- find_method = pick_method(klass, [:find_for_sitemap, :all])
- raise ArgumentError, "#{klass} must provide a count_for_sitemap class method" if count_method.nil?
- raise ArgumentError, "#{klass} must provide a find_for_sitemap class method" if find_method.nil?
-
- count = klass.send(count_method)
- num_sitemaps = 1
- num_batches = 1
-
- if count > @batch_size
- num_batches = (count.to_f / @batch_size.to_f).ceil
- num_sitemaps = (count.to_f / @max_per_sitemap.to_f).ceil
- end
- batches_per_sitemap = num_batches.to_f / num_sitemaps.to_f
-
- # Update the @sources hash so that the index file knows how many sitemaps to link to
- source[:num_sitemaps] = num_sitemaps
-
- for sitemap_num in 1..num_sitemaps
- # Work out the start and end batch numbers for this sitemap
- batch_num_start = sitemap_num == 1 ? 1 : ((sitemap_num * batches_per_sitemap).ceil - batches_per_sitemap + 1).to_i
- batch_num_end = (batch_num_start + [batches_per_sitemap, num_batches].min).floor - 1
-
- # Stream XML output to a file
- filename = "sitemap_#{Extlib::Inflection::underscore(klass.to_s)}"
- filename << "_#{sitemap_num}" if num_sitemaps > 1
-
- gz = gz_writer("#{filename}.xml.gz")
-
- xml = Builder::XmlMarkup.new(:target => gz)
- xml.instruct!
- xml.urlset(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
- for batch_num in batch_num_start..batch_num_end
- offset = ((batch_num - 1) * @batch_size)
- limit = (count - offset) < @batch_size ? (count - offset - 1) : @batch_size
- find_options = num_batches > 1 ? {:limit => limit, :offset => offset} : {}
-
- klass.send(find_method, find_options).each do |r|
- last_mod_method = pick_method(
- r,
- [:updated_at, :updated_on, :updated, :created_at, :created_on, :created]
- )
- last_mod = last_mod_method.nil? ? Time.now : r.send(last_mod_method)
-
- param_method = pick_method(r, [:to_param, :id])
- raise ArgumentError, "#{klass} must provide a to_param instance method" if param_method.nil?
-
- xml.url do
- xml.loc("#{@base_url}/#{source[:path]}/#{r.send(param_method)}")
- xml.lastmod(last_mod.strftime('%Y-%m-%d')) unless last_mod.nil?
- xml.changefreq('weekly')
- end
- end
- end
+ for model, options in @sources
+ with_sitemap(model.name.tableize) do
+ find_options = options.dup
+ changefreq = find_options.delete(:change_frequency) || 'weekly'
+ find_options[:batch_size] ||= @options[:batch_size]
+ timestamp_column = model.column_names.find { |col| TIMESTAMP_COLUMNS.include? col }
+
+ model.find_each(find_options) do |record|
+ last_updated = record.read_attribute(timestamp_column)
+ add_url(polymorphic_url(record), last_updated, changefreq)
end
-
- gz.close
end
-
end
generate_sitemap_index
- ping_search_engines
- self # Chainable
end
private
- def pick_method(klass, candidates)
- method = nil
- candidates.each do |candidate|
- if klass.respond_to? candidate
- method = candidate
- break
- end
+
+ def with_sitemap(name)
+ @sitemap = "sitemap_#{name}"
+ @parts = 0
+ @urls = 0
+ init_part
+ begin
+ yield
+ ensure
+ close_part
end
- method
end
-
- def gz_writer(filename)
- Zlib::GzipWriter.new(File.open("#{@file_path}/#{filename}", 'w+'))
+
+ def init_part
+ part_filename = @sitemap
+ part_filename += "_#{@parts}" if @parts > 0
+
+ @xml = Builder.new(:target => xml_open(part_filename), :indent => 2)
+ @xml.instruct!
+ @xml.open_urlset!(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9')
end
-
- def sitemap_index_filename
- 'sitemap_index.xml.gz'
+
+ def add_url(url, time, freq)
+ rotate_parts! if @options[:max_per_sitemap] == @urls
+
+ @xml.url do
+ @xml.loc url
+ @xml.lastmod time.to_s(:sitemap)
+ @xml.changefreq freq
+ end
+ @urls += 1
+ end
+
+ def close_part
+ @xml.close_urlset!
+ @xml.target!.close
+ end
+
+ def rotate_parts!
+ close_part
+ @urls = 0
+ @parts += 1
+ init_part
+ end
+
+ def xml_open(filename)
+ filename += '.xml'
+ filename << '.gz' if @options[:gzip]
+ file = File.open("#{@root}/#{filename}", 'w+')
+ @sitemap_files << file.path
+ writer = @options[:gzip] ? Zlib::GzipWriter.new(file) : file
+
+ if block_given?
+ yield writer
+ writer.close
+ end
+ writer
end
- # Create a sitemap index document
def generate_sitemap_index
- xml = ''
- builder = Builder::XmlMarkup.new(:target => xml)
- builder.instruct!
- builder.sitemapindex(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
- @sources.each do |source|
- num_sitemaps = source[:num_sitemaps]
- for i in 1..num_sitemaps
- loc = "#{@base_url}/#{@web_path}/sitemap_#{Extlib::Inflection::underscore(source[:model].to_s)}"
- loc << "_#{i}" if num_sitemaps > 1
- loc << '.xml.gz'
-
- builder.sitemap do
- builder.loc(loc)
- builder.lastmod(Time.now.strftime('%Y-%m-%d'))
+ xml_open 'sitemap_index' do |file|
+ xml = Builder.new(:target => file, :indent => 2)
+ xml.instruct!
+ xml.sitemapindex(:xmlns => 'http://www.sitemaps.org/schemas/sitemap/0.9') do
+ for path in @sitemap_files[0..-2]
+ xml.sitemap do
+ xml.loc url_for_sitemap(path)
+ xml.lastmod File.stat(path).mtime.to_s(:sitemap)
end
end
end
end
-
- gz = gz_writer(sitemap_index_filename)
- gz.write(xml)
- gz.close
end
-
- def sitemap_uri
- URI.escape("#{@base_url}/#{@web_path}/#{sitemap_index_filename}")
- end
-
- # Notify Google of the new sitemap index file
- def ping_google
- Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
- end
-
- # Notify Yahoo! of the new sitemap index file
- def ping_yahoo
- Net::HTTP.get('search.yahooapis.com', "/SiteExplorerService/V1/updateNotification?appid=#{@yahoo_app_id}&url=#{sitemap_uri}")
- end
-
- # Notify MSN of the new sitemap index file
- def ping_msn
- Net::HTTP.get('webmaster.live.com', "/ping.aspx?siteMap=#{sitemap_uri}")
+
+ def url_for_sitemap(path)
+ root_url + File.basename(path)
end
-
- # Notify Ask of the new sitemap index file
- def ping_ask
- Net::HTTP.get('submissions.ask.com', "/ping?sitemap=#{sitemap_uri}")
+
+ def root_url
+ @root_url ||= begin
+ url = ''
+ url << (default_url_options[:protocol] || 'http')
+ url << '://' unless url.match('://')
+ url << default_url_options[:host]
+ url << ":#{port}" if port = default_url_options[:port] and port != 80
+ url << '/'
+ end
end
def ping_search_engines
- ping_google if @ping_google
- ping_yahoo if @ping_yahoo && @yahoo_app_id
- ping_msn if @ping_msn
- ping_ask if @ping_ask
+ require 'net/http'
+ require 'cgi'
+
+ sitemap_uri = CGI::escape(url_for_sitemap(@sitemap_files.last))
+
+ if @options[:ping_google]
+ Net::HTTP.get('www.google.com', "/webmasters/tools/ping?sitemap=#{sitemap_uri}")
+ end
+
+ if @options[:ping_yahoo]
+ if @options[:yahoo_app_id]
+ Net::HTTP.get('search.yahooapis.com', "/SiteExplorerService/V1/updateNotification?" +
+ "appid=#{@options[:yahoo_app_id]}&url=#{sitemap_uri}")
+ else
+ $stderr.puts 'unable to ping Yahoo: no ":yahoo_app_id" provided'
+ end
+ end
+
+ if @options[:ping_msn]
+ Net::HTTP.get('webmaster.live.com', "/ping.aspx?siteMap=#{sitemap_uri}")
+ end
end
end
Oops, something went wrong.

0 comments on commit 44dcae4

Please sign in to comment.