Merge pull request jekyll#437 from stephenmcd/master

Improved Tumblr Support
nickh · Jan 22, 2012 · d36a106 · d36a106
2 parents 456a9f3 + f2502df
commit d36a106
Showing 1 changed file with 175 additions and 99 deletions.
diff --git a/lib/jekyll/migrators/tumblr.rb b/lib/jekyll/migrators/tumblr.rb
@@ -1,119 +1,195 @@
 require 'rubygems'
-require 'nokogiri'
 require 'open-uri'
 require 'fileutils'
-require 'CGI'
-require 'iconv'
+require 'nokogiri'
 require 'date'
+require 'json'
+require 'uri'
+require 'jekyll'
 
 module Jekyll
   module Tumblr
-    def self.process(url, grab_images = false)
-      current_page = 0
-
-      while true
-        f = open(url + "/api/read?num=50&start=#{current_page * 50}")
-        doc = Nokogiri::HTML(Iconv.conv("utf-8", f.charset, f.readlines.join("\n")))
-
-        puts "Page: #{current_page + 1} - Posts: #{(doc/:tumblr/:posts/:post).size}"
-
-        FileUtils.mkdir_p "_posts/tumblr"
-
-        (doc/:tumblr/:posts/:post).each do |post|
-          title = ""
-          content = nil
-          name = nil
-
-          if post['type'] == "regular"
-            title_element = post.at("regular-title")
-            title = title_element.inner_text unless title_element == nil
-            content = CGI::unescapeHTML post.at("regular-body").inner_html unless post.at("regular-body") == nil
-          elsif post['type'] == "link"
-            title = post.at("link-text").inner_html unless post.at("link-text") == nil
-
-            if post.at("link-text") != nil
-              content = "<a href=\"#{post.at("link-url").inner_html}\">#{post.at("link-text").inner_html}</a>"
-            else
-              content = "<a href=\"#{post.at("link-url").inner_html}\">#{post.at("link-url").inner_html}</a>"
-            end
-
-            content << "<br/>" + CGI::unescapeHTML(post.at("link-description").inner_html) unless post.at("link-description") == nil
-          elsif post['type'] == "photo"
-            content = ""
-
-            if post.at("photo-link-url") != nil
-              content = "<a href=\"#{post.at("photo-link-url").inner_html}\"><img src=\"#{save_file((post/"photo-url")[1].inner_html, grab_images)}\"/></a>"
-            else
-              content = "<img src=\"#{save_file((post/"photo-url")[1].inner_html, grab_images)}\"/>"
-            end
-
-            if post.at("photo-caption") != nil
-              content << "<br/>" unless content == nil
-              content << CGI::unescapeHTML(post.at("photo-caption").inner_html)
-            end
-          elsif post['type'] == "audio"
-            content = CGI::unescapeHTML(post.at("audio-player").inner_html)
-            content << CGI::unescapeHTML(post.at("audio-caption").inner_html) unless post.at("audio-caption") == nil
-          elsif post['type'] == "quote"
-            content = "<blockquote>" + CGI::unescapeHTML(post.at("quote-text").inner_html) + "</blockquote>"
-            content << "&#8212;" + CGI::unescapeHTML(post.at("quote-source").inner_html) unless post.at("quote-source") == nil
-          elsif post['type'] == "conversation"
-            title = post.at("conversation-title").inner_html unless post.at("conversation-title") == nil
-            content = "<section><dialog>"
-
-            (post/:conversation/:line).each do |line|
-              content << "<dt>" + line['label'] + "</dt><dd>" + line.inner_html + "</dd>" unless line['label'] == nil || line == nil
-            end
-
-            content << "</section></dialog>"
-          elsif post['type'] == "video"
-            title = post.at("video-title").inner_html unless post.at("video-title") == nil
-            content = CGI::unescapeHTML(post.at("video-player").inner_html)
-            content << CGI::unescapeHTML(post.at("video-caption").inner_html) unless post.at("video-caption") == nil
-          end # End post types
-
-          name = "#{Date.parse(post['date']).to_s}-#{post['id'].downcase.gsub(/[^a-z0-9]/, '-')}.html"
-
-          if title != nil || content != nil && name != nil
-            File.open("_posts/tumblr/#{name}", "w") do |f|
-
-              f.puts <<-HEADER
----
-layout: post
-title: #{title}
----
+    def self.process(url, format = "html", grab_images = false,
+                     add_highlights = false, rewrite_urls = true)
+      @grab_images = grab_images
+      FileUtils.mkdir_p "_posts/tumblr"
+      url += "/api/read/json/"
+      per_page = 50
+      posts = []
+      # Two passes are required so that we can rewrite URLs.
+      # First pass builds up an array of each post as a hash.
+      begin
+        current_page = (current_page || -1) + 1
+        feed = open(url + "?num=#{per_page}&start=#{current_page * per_page}")
+        json = feed.readlines.join("\n")[21...-2]  # Strip Tumblr's JSONP chars.
+        blog = JSON.parse(json)
+        puts "Page: #{current_page + 1} - Posts: #{blog["posts"].size}"
+        posts += blog["posts"].map { |post| post_to_hash(post, format) }
+      end until blog["posts"].size < per_page
+      # Rewrite URLs and create redirects.
+      posts = rewrite_urls_and_redirects posts if rewrite_urls
+      # Second pass for writing post files.
+      posts.each do |post|
+        if format == "md"
+          post[:content] = html_to_markdown post[:content]
+          post[:content] = add_syntax_highlights post[:content] if add_highlights
+        end
+        File.open("_posts/tumblr/#{post[:name]}", "w") do |f|
+          f.puts post[:header].to_yaml + "---\n" + post[:content]
+        end
+      end
+    end
 
-HEADER
+    private
 
-              f.puts content
-            end # End file
+    # Converts each type of Tumblr post to a hash with all required
+    # data for Jekyll.
+    def self.post_to_hash(post, format)
+      case post['type']
+        when "regular"
+          title = post["regular-title"]
+          content = post["regular-body"]
+        when "link"
+          title = post["link-text"] || post["link-url"]
+          content = "<a href=\"#{post["link-url"]}\">#{title}</a>"
+          unless post["link-description"].nil?
+            content << "<br/>" + post["link-description"]
           end
+        when "photo"
+          title = post["photo-caption"]
+          max_size = post.keys.map{ |k| k.gsub("photo-url-", "").to_i }.max
+          url = post["photo-url"] || post["photo-url-#{max_size}"]
+          ext = "." + post[post.keys.select { |k|
+            k =~ /^photo-url-/ && post[k].split("/").last =~ /\./
+          }.first].split(".").last
+          content = "<img src=\"#{save_file(url, ext)}\"/>"
+          unless post["photo-link-url"].nil?
+            content = "<a href=\"#{post["photo-link-url"]}\">#{content}</a>"
+          end
+        when "audio"
+          if !post["id3-title"].nil?
+            title = post["id3-title"]
+            content = post.at["audio-player"] + "<br/>" + post["audio-caption"]
+          else
+            title = post["audio-caption"]
+            content = post.at["audio-player"]
+          end
+        when "quote"
+          title = post["quote-text"]
+          content = "<blockquote>#{post["quote-text"]}</blockquote>"
+          unless post["quote-source"].nil?
+            content << "&#8212;" + post["quote-source"]
+          end
+        when "conversation"
+          title = post["conversation-title"]
+          content = "<section><dialog>"
+          post["conversation"]["line"].each do |line|
+            content << "<dt>#{line['label']}</dt><dd>#{line}</dd>"
+          end
+          content << "</section></dialog>"
+        when "video"
+          title = post["video-title"]
+          content = post["video-player"]
+          unless post["video-caption"].nil?
+            content << "<br/>" + post["video-caption"]
+          end
+      end
+      date = Date.parse(post['date']).to_s
+      title = Nokogiri::HTML(title).text
+      slug = title.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
+      {
+        :name => "#{date}-#{slug}.#{format}",
+        :header => {
+          "layout" => "post",
+          "title" => title,
+          "tags" => post["tags"],
+        },
+        :content => content,
+        :url => post["url"],
+        :slug => post["url-with-slug"],
+      }
+    end
 
-        end # End post XML
-
-        if (doc/:tumblr/:posts/:post).size < 50
-          break
-        else
-          current_page = current_page + 1
+    # Create a Hash of old urls => new urls, for rewriting and
+    # redirects, and replace urls in each post. Instantiate Jekyll
+    # site/posts to get the correct permalink format.
+    def self.rewrite_urls_and_redirects(posts)
+      site = Jekyll::Site.new(Jekyll.configuration({}))
+      dir = File.join(File.dirname(__FILE__), "..")
+      urls = Hash[posts.map { |post|
+        # Create an initial empty file for the post so that
+        # we can instantiate a post object.
+        File.open("_posts/tumblr/#{post[:name]}", "w")
+        tumblr_url = URI.parse(post[:slug]).path
+        jekyll_url = Jekyll::Post.new(site, dir, "", "tumblr/" + post[:name]).url
+        redirect_dir = tumblr_url.sub(/\//, "") + "/"
+        FileUtils.mkdir_p redirect_dir
+        File.open(redirect_dir + "index.html", "w") do |f|
+          f.puts "<html><head><meta http-equiv='Refresh' content='0; " +
+                 "url=#{jekyll_url}'></head><body></body></html>"
         end
+        [tumblr_url, jekyll_url]
+      }]
+      posts.map { |post|
+        urls.each do |tumblr_url, jekyll_url|
+          post[:content].gsub!(/#{tumblr_url}/i, jekyll_url)
+        end
+        post
+      }
+    end
 
-      end # End while loop
-    end # End method
-
-    private
-
-    def self.save_file(url, grab_image = false)
-      unless grab_image == false
-        FileUtils.mkdir_p "tumblr_files"
+    # Uses Python's html2text to convert a post's content to
+    # markdown. Preserve HTML tables as per the markdown docs.
+    def self.html_to_markdown(content)
+      preserve = ["table", "tr", "th", "td"]
+      preserve.each do |tag|
+        content.gsub!(/<#{tag}/i, "$$" + tag)
+        content.gsub!(/<\/#{tag}/i, "||" + tag)
+      end
+      content = %x[echo '#{content.gsub("'", "''")}' | html2text]
+      preserve.each do |tag|
+        content.gsub!("$$" + tag, "<" + tag)
+        content.gsub!("||" + tag, "</" + tag)
+      end
+      content
+    end
 
-        File.open("tumblr_files/#{url.split('/').last}", "w") do |f|
-          f.write(open(url).read)
+    # Adds pygments highlight tags to code blocks in posts that use
+    # markdown format. This doesn't guess the language of the code
+    # block, so you should modify this to suit your own content.
+    # For example, my code block only contain Python and JavaScript,
+    # so I can assume the block is JavaScript if it contains a
+    # semi-colon.
+    def self.add_syntax_highlights(content)
+      lines = content.split("\n")
+      block, indent, lang, start = false, /^    /, nil, nil
+      lines.each_with_index do |line, i|
+        if !block && line =~ indent
+          block = true
+          lang = "python"
+          start = i
+        elsif block
+          lang = "javascript" if line =~ /;$/
+          block = line =~ indent && i < lines.size - 1 # Also handle EOF
+          if !block
+            lines[start] = "{% highlight #{lang} %}"
+            lines[i - 1] = "{% endhighlight %}"
+          end
+          lines[i] = lines[i].sub(indent, "")
         end
+      end
+      lines.join("\n")
+    end
 
-        return "/tumblr_files/#{url.split('/').last}"
-      else
-        return url
+    def self.save_file(url, ext)
+      if @grab_images
+        path = "tumblr_files/#{url.split('/').last}"
+        path += ext unless path =~ /#{ext}$/
+        FileUtils.mkdir_p "tumblr_files"
+        File.open(path, "w") { |f| f.write(open(url).read) }
+        url = "/" + path
       end
+      url
     end
   end
 end