Updated code, README etc.

miohtama · May 2, 2013 · 8884c14 · 8884c14
1 parent 0252991
commit 8884c14
Show file tree

Hide file tree

Showing 4 changed files with 81 additions and 38 deletions.
diff --git a/.gitignore b/.gitignore
@@ -34,6 +34,7 @@ __minitage__*
 .settings
 .project
 .pydevproject
+.DS_Store
 virtualenv.py
 venv/
 images/

diff --git a/README.rst b/README.rst
@@ -0,0 +1,60 @@
+.. contents :: :local:
+
+Introduction
+----------------
+
+This is a Python script to convert a PDF to a series of images with alt texts.
+It makes the presentation suitable for embedding in a blog post and for reading on a mobile device and such.
+
+My workflow:
+
+* Export presentation from Keynote to PDF. In the Export dialog, untick *include date* and *add borders around slides*.
+
+* Convert PDFs to JPEGs using Ghostscript
+
+* Scrape `<img>` alt texts from the PDF pages using pyPdf
+
+* Insert optional image full URL prefix, so you don't need to manually link images to the hosting service
+
+* Copy-paste generated HTML to your blog post
+
+Tested with Apple Keynote exported PDFs, but the approach should work for any PDF content.
+
+Installation
+--------------
+
+Dependencies (OSX)::
+
+    sudo port install ghostscript
+
+Please note that Ghostscript 9.06 crashed for me during the export. Please upgrade to 9.07.
+
+Setting up virtualenv and installing the code::
+
+    git clone xxx
+    cd pdf-presentation-to-html
+    curl -L -o virtualenv.py https://raw.github.com/pypa/virtualenv/master/virtualenv.py
+    python virtualenv.py venv
+    . venv/bin/activate
+    pip install pyPdf
+
+Usage
+----------
+
+Example::
+
+    . venv/bin/activate
+    python pdf2html.py test.pdf output
+
+Advanced example::
+
+    . venv/bin/activate
+    python pdf2html.py test.pdf output http://example.com/slides/
+
+
+Author
+--------------
+
+Mikko Ohtamaa (`blog <https://opensourcehacker.com>`_, `Facebook <https://www.facebook.com/?q=#/pages/Open-Source-Hacker/181710458567630>`_, `Twitter <https://twitter.com/moo9000>`_, `Google+ <https://plus.google.com/u/0/103323677227728078543/>`_)
+
+
diff --git a/pdf-presentation-to-html-snippet.py → pdf2html.py b/pdf-presentation-to-html-snippet.py → pdf2html.py
@@ -1,45 +1,21 @@
 """
 
-    PDF to blog post.
-
-    Convert PDF presentation to a blog post friendly format:
-
-    - HTML snippet
-
-    - Series of extracted images
-
-    - With alt tags
-
-    Dependencies (OSX)::
-
-        sudo port install ghostscript
-
-    Installation:
-
-        git clone xxx
-        cd pdf-presentation-to-html
-        curl -L -o virtualenv.py https://raw.github.com/pypa/virtualenv/master/virtualenv.py
-        python virtualenv.py venv
-        . venv/bin/activate
-        pip install pyPdf
-
-
+    PDF to HTML converter.
 
 """
 
 import os
 import sys
-import shutil
-from StringIO import StringIO
-from collections import defaultdict
 
 import pyPdf
-
 from pyPdf.pdf import ContentStream
 from pyPdf.pdf import TextStringObject
 
 
-SLIDE_TEMPLATE = u'<div class="slide"><img src="{prefix}{src}" alt="{alt}" /></div>'
+SLIDE_TEMPLATE = u'<p class="slide"><img src="{prefix}{src}" alt="{alt}" /></p>'
+
+# You can pass Ghostscript binary to the script as an environment variable.
+GHOSTSCRIPT = os.environ.get("GHOSTSCRIPT", "gs")
 
 
 def create_images(src, target, width=620, height=480):
@@ -55,12 +31,19 @@ def create_images(src, target, width=620, height=480):
     if target.endswith("/"):
         target = target[0:-1]
 
+    # Generated filenames
     ftemplate = "%(target)s/slide%%d.jpg" % locals()
 
-    cmd = "gs -dNOPAUSE -dPDFFitPage -sDEVICE=jpeg -sOutputFile=" + ftemplate + \
-          " -dJPEGQ=70 -dDEVICEWIDTH=800 -dDEVICEHEIGHT=600  %(src)s -c quit" % locals()
+    # gs binary
+    ghostscript = GHOSTSCRIPT
+
+    # Build the Ghostscript export command.
+    # Note: Ghostscript 9.06 crashed for me;
+    # I had to upgrade to 9.07.
+    # This command does the magic of anti-aliasing text and setting the output JPEG dimensions correctly
+    cmd = "%(ghostscript)s -dNOPAUSE -dPDFFitPage -dTextAlphaBits=4 -sDEVICE=jpeg -sOutputFile=%(ftemplate)s -dJPEGQ=80 -dDEVICEWIDTH=%(width)d -dDEVICEHEIGHT=%(height)d  %(src)s -c quit"
+    cmd = cmd % locals()  # Apply templating
 
-    print "Executing: %s" % cmd
     if os.system(cmd):
         raise RuntimeError("Command failed: %s" % cmd)
 
@@ -104,7 +87,7 @@ def extract_text(self):
 
 
 def scrape_text(src):
-    """ Read PDF file and return plain text on each page.
+    """ Read a PDF file and return plain text of each page.
 
     http://stackoverflow.com/questions/25665/python-module-for-converting-pdf-to-text
 
@@ -122,26 +105,26 @@ def scrape_text(src):
 
 
 def create_index_html(target, slides, prefix):
-    """
+    """ Generate HTML code for `<img>` tags.
     """
 
     out = open(target, "wt")
 
     print >> out, "<!doctype html>"
     for i in xrange(0, len(slides)):
         alt = slides[i]  # ALT text for this slide
-        params = dict(src=u"slide%d.jpg" % i, prefix=prefix, alt=alt)
+        params = dict(src=u"slide%d.jpg" % (i+1), prefix=prefix, alt=alt)
         line = SLIDE_TEMPLATE.format(**params)
         print >> out, line.encode("utf-8")
 
     out.close()
 
 
 def main():
-    """ """
+    """ Entry point. """
 
     if len(sys.argv) < 3:
-        sys.exit("Usage: pdf-presentation-to-html-snippet.py mypresentation.pdf targetfolder [image path prefix]")
+        sys.exit("Usage: pdf2html.py mypresentation.pdf targetfolder [image path prefix]")
 
     src = sys.argv[1]
     folder = sys.argv[2]
@@ -158,7 +141,6 @@ def main():
 
     target_html = os.path.join(folder, "index.html")
 
-    print "Creating: " + target_html
     create_index_html(target_html, alt_texts, prefix)
 
     create_images(src, folder)

diff --git a/test.pdf b/test.pdf