Browse files

initial commit

  • Loading branch information...
0 parents commit 4a919c1f9049e7a44b4f90b4764b639f2f84e4f0 @mattharrison committed Jun 28, 2012
Showing with 338 additions and 0 deletions.
  1. +79 −0 README.rst
  2. +224 −0 ebookgen.py
  3. +35 −0 test.py
79 README.rst
@@ -0,0 +1,79 @@
+=================
+ ebookgenerators
+=================
+
+This module is useful as an example of real-world generators. They
+have been used with some degree of success to help my mother. She's an
+avid reader and Kindle owner. She enjoys that she can increase the font
+size on her Kindle. She obtained some PDFs that were quite painful to
+read both in dead-tree form and on the Kindle. Being the kind son I am,
+I translated them to the mobi format and emailed them to her Kindle.
+
+I've used these generators on about a dozen PDFs. Some tweaking is
+required because the output of each PDF seems a bit different. YMMV.
+
+pdf to mobi
+===========
+
+My process went like this:
+
+* use ``pdf2txt.py`` to get text of pdf files.
+* inspect txt files and tweak to get them to rst if possible
+* create a python generator pipeline to clean up text (using ``ebookgen``)
+* use ``rst2epub2`` or ``rst2html`` to create mobi file
+* email mobi file to someone@kindle.com
+
+For normal people
+=================
+
+If you don't care about ebooks that much (or making your own), then
+this library might be interesting as examples of real-world
+generators. Of particular interest might be the ``Peeker`` class, which
+allows looking ahead in iterators. (This comes in handy often when
+tweaking text.)
+
+There are also a few examples of basic generators and some fancier
+ones that use ``Peeker``.
+
+These generators aren't necessarily written in a functional style
+using *map*, *reduce* and *filter*, though they probably could
+be. Sorry.
+
+Creating a generator chain
+==========================
+
+My script to clean the ``pdf2txt.py`` output looks something like this::
+
+
+ import sys
+
+ import ebookgen
+
+
+ def run():
+ data = sys.stdin
+ data = ebookgen.remove_leading_space(data)
+ data = ebookgen.remove_dash_page(data)
+ data = ebookgen.remove_carot_l(data)
+ data = ebookgen.remove_two_spaces(data)
+ data = ebookgen.remove_double_returns(data)
+ data = ebookgen.insert_extra_paragraph_line(data)
+ data = ebookgen.insert_rst_sections(data)
+
+ for line in data:
+ print line,
+
+ if __name__ == '__main__':
+ run()
+
+
+License
+=======
+
+MIT
+
+Copyright
+=========
+
+Matt Harrison, 2012
+
224 ebookgen.py
@@ -0,0 +1,224 @@
+# -*- coding: utf-8
+import sys
+
+
def fix_bullets(lines):
    """Try to convert bullets to rst style bullets, by indenting the
    continuation lines of each bullet appropriately.

    A bullet starts at a line beginning with a dash followed by one to
    three spaces and runs up to and including the next blank line. A
    newline line is emitted before the bullet, and every following line
    of the bullet (the closing blank line too) is prefixed with as many
    spaces as the bullet marker is wide.

    ``lines`` may be any iterable of strings. If the input ends in the
    middle of a bullet, the lines collected so far are still emitted.
    """
    # Longest marker first: a dash plus three spaces also starts with a
    # dash plus one space. Built programmatically so the widths stay
    # unambiguous.
    markers = ['-' + ' ' * width for width in (3, 2, 1)]
    # One shared iterator lets the inner loop consume the bullet's
    # continuation lines from the same stream the outer loop reads.
    lines = iter(lines)
    for line in lines:
        for marker in markers:
            if line.startswith(marker):
                indent = len(marker)
                break
        else:
            # not a bullet, pass through untouched
            yield line
            continue
        yield '\n'
        yield line
        for next_line in lines:
            yield ' ' * indent + next_line
            if not next_line.strip():
                # blank line closes the bullet
                break
+
def remove_carot_l(lines):
    """Strip every form-feed character (``^L``, ASCII 12) from each line."""
    form_feed = chr(12)
    for text in lines:
        yield text.replace(form_feed, '')
+
+
def remove_two_spaces(lines):
    """Replace each pair of adjacent spaces in every line with one space.

    This is a single pass per line, so a run of four spaces becomes two.
    """
    # Built from counts so the pattern width cannot be silently altered
    # by whitespace normalisation.
    double_space = ' ' * 2
    single_space = ' '
    for line in lines:
        yield line.replace(double_space, single_space)
+
+
def remove_page(lines):
    """Drop every line that starts with ``Page`` or ``(Page``."""
    page_prefixes = ('Page', '(Page')
    for text in lines:
        if not text.startswith(page_prefixes):
            yield text
+
+
def remove_foo(lines, foo, remove_blanks=True):
    """Drop every line that starts with the prefix ``foo``.

    ``remove_blanks`` is accepted but not consulted by this function.
    """
    for text in lines:
        if not text.startswith(foo):
            yield text
+
+
class PeekDone(Exception):
    """Signals that a look-ahead ran past the end of the sequence."""
+
+
class Peeker(object):
    """Iterator wrapper that allows looking ahead without consuming.

    Items that have been peeked at are held in ``self.buffer`` (oldest
    first) and are handed out again, in order, by ``next``/``pop``.
    Works on both Python 2 and Python 3.
    """

    def __init__(self, seq, sub_peek=True):
        """Wrap the iterable ``seq``; ``sub_peek`` is accepted but unused."""
        self.seq = iter(seq)
        self.buffer = []

    def pop(self):
        """Discard and return the oldest buffered (already peeked) item.

        Returns ``None`` when nothing is buffered; the underlying
        sequence is never advanced by this method.
        """
        if self.buffer:
            return self.buffer.pop(0)
        return None

    def peek(self, n=0):
        """Return the item ``n`` positions ahead without consuming it.

        This can raise an exception if peeking off the end. Be aware and
        handle ``PeekDone`` appropriately. Items read before the end was
        hit stay buffered, so nothing is lost.
        """
        # Fill the buffer as far as needed, so peek(2) works even when
        # peek(0) and peek(1) were not called first.
        while len(self.buffer) <= n:
            try:
                self.buffer.append(next(self.seq))
            except StopIteration:
                raise PeekDone('Exhausted')
        return self.buffer[n]

    def __iter__(self):
        return self

    def next(self):
        """Return the next item, draining the peek buffer first.

        Raises ``StopIteration`` once buffer and sequence are exhausted.
        """
        if self.buffer:
            return self.buffer.pop(0)
        return next(self.seq)

    # Python 3 iterator protocol; ``next`` is kept for Python 2 callers.
    __next__ = next
+
+
def remove_double_returns(lines):
    """Yield every line, discarding a blank line that directly follows it.

    The line after each yielded line is inspected; when it is blank it
    is thrown away, so it is never yielded itself.
    """
    stream = Peeker(lines)
    for current in stream:
        try:
            follower = stream.peek()
        except PeekDone:
            # last line of the input
            yield current
            return
        yield current
        if blank(follower):
            # swallow the blank follower
            stream.pop()
+
+
def remove_excess_returns(lines):
    """Yield only the non-blank lines of ``lines``.

    ``lines`` may be any iterable. Blank lines at the end of the input
    are dropped like all others.
    """
    # NOTE(review): the name suggests runs of blank lines should collapse
    # to one, but the original loop skipped every blank line - confirm
    # the intent before changing the output.
    for line in lines:
        if not blank(line):
            yield line
+
+
def ends_paragraph(line):
    """Return True when ``line`` looks like the last line of a paragraph.

    A line qualifies when it is shorter than 86 characters or when,
    ignoring surrounding whitespace, it ends with a period or a double
    quote.
    """
    if len(line) < 86:
        return True
    return line.strip().endswith(('.', '"'))
+
+
def ends_sentence(line):
    """Return True when ``line`` ends with a period.

    Trailing whitespace (including the newline that lines read from a
    file carry) is ignored, matching how ``ends_paragraph`` strips the
    line before testing it.
    """
    return line.rstrip().endswith('.')
+
+
def remove_leading_space(lines):
    """Strip leading whitespace from every line.

    A line made only of whitespace (including a bare newline) comes out
    as the empty string.
    """
    for raw in lines:
        trimmed = raw.lstrip()
        yield trimmed
+
+
def blank(line):
    """Return True when ``line`` is empty or holds only whitespace."""
    return len(line.strip()) == 0
+
+
def insert_extra_paragraph_line(lines):
    """Yield every line, adding a newline line after each paragraph end.

    Whether a line closes a paragraph is decided by ``ends_paragraph``.
    """
    for text in lines:
        paragraph_done = ends_paragraph(text)
        yield text
        if paragraph_done:
            yield '\n'
+
+
def insert_rst_sections(lines, section_char='-'):
    """ if we have two blank lines treat as a section divider

    Between the two blank lines a line of 40 ``section_char``
    characters is emitted; all other lines pass through unchanged.
    """
    stream = Peeker(lines)
    for current in stream:
        try:
            upcoming = stream.peek()
        except PeekDone:
            # end of content
            yield current
            return
        if not (blank(current) and blank(upcoming)):
            yield current
            continue
        yield current
        yield '%s\n' % (section_char*40)
        yield upcoming
        # the second blank line was emitted above, drop it from the buffer
        stream.pop()
+
+
def fix_space_in_paragraph(lines):
    """ If paragraphs span pages (often) then there could be extra
    returns in the paragraphs....

    A blank line is dropped when the line before it does not end a
    sentence (per ``ends_sentence``). The final two lines of the input
    are emitted as they are.
    """
    stream = Peeker(lines)
    for current in stream:
        try:
            following = stream.peek()
        except PeekDone:
            yield current
            return
        try:
            # only checks that a second line ahead exists
            stream.peek(1)
        except PeekDone:
            yield current
            yield following
            return
        if blank(following) and (not ends_sentence(current)):
            # the blank line is not wanted, so discard it
            stream.pop()
        yield current
+
+
def remove_dash_page(lines, prev_lines_remove=3, after_lines_remove=2):
    """
    fix stuff like:

    end of page.


    - 6 -

    Next page...

    For each line, the line ``prev_lines_remove`` positions past the
    next one is inspected. When it starts with ``-`` and, stripped, ends
    with ``-``, the lines leading up to it, the marker itself and up to
    ``after_lines_remove`` lines after it are discarded. The current
    line is always yielded.
    """
    stream = Peeker(lines)
    for current in stream:
        try:
            # peek one position at a time so the look-ahead buffer grows
            # step by step
            for offset in range(prev_lines_remove):
                stream.peek(offset)
            candidate = stream.peek(prev_lines_remove)
        except PeekDone:
            # too close to the end for a page marker to follow
            yield current
            continue
        looks_like_page = (candidate.startswith('-')
                           and candidate.strip().endswith('-'))
        if looks_like_page:
            for _ in range(prev_lines_remove):
                stream.pop()
            # remove page
            stream.pop()
            for _ in range(after_lines_remove):
                try:
                    stream.peek()
                except PeekDone:
                    continue
                stream.pop()
        yield current
+
35 test.py
@@ -0,0 +1,35 @@
+import unittest
+
+import ebookgen
+
class TestPeek(unittest.TestCase):
    """Exercise look-ahead, pop and exhaustion of ``ebookgen.Peeker``."""

    def test_peeker(self):
        # peeking repeatedly does not consume; next/pop drain in order
        x = ebookgen.Peeker(range(10))
        self.assertEqual(x.peek(), 0)
        self.assertEqual(x.peek(1), 1)
        self.assertEqual(x.peek(1), 1)
        self.assertEqual(x.peek(2), 2)
        self.assertEqual(x.next(), 0)
        self.assertEqual(x.peek(), 1)
        self.assertEqual(x.peek(), 1)
        x.pop()
        self.assertEqual(x.peek(), 2)
        self.assertEqual(x.next(), 2)
        # range() is not a list on Python 3, so compare list with list
        self.assertEqual(list(x), list(range(3, 10)))

    def test_peek_past_end(self):
        x = ebookgen.Peeker(range(1))
        self.assertEqual(x.peek(), 0)
        self.assertRaises(ebookgen.PeekDone, x.peek, 1)

    def test_exhausted(self):
        x = ebookgen.Peeker(range(2))
        self.assertEqual(x.peek(), 0)
        self.assertEqual(x.pop(), 0)
        self.assertEqual(x.peek(), 1)
        self.assertEqual(x.pop(), 1)
        self.assertRaises(ebookgen.PeekDone, x.peek)
        self.assertRaises(StopIteration, x.next)
        # pop on an empty buffer returns None rather than raising
        self.assertEqual(x.pop(), None)


if __name__ == '__main__':
    unittest.main()

0 comments on commit 4a919c1

Please sign in to comment.