Skip to content

Commit

Permalink
parse_government_bill_pdf: initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Alon Levy committed Aug 21, 2010
1 parent 1d84dff commit 1796233
Show file tree
Hide file tree
Showing 9 changed files with 640 additions and 0 deletions.
8 changes: 8 additions & 0 deletions buildout.cfg
Expand Up @@ -12,6 +12,14 @@ find-links =
http://bitbucket.org/jespern/django-piston/get/tip.tar.gz#egg=django-piston
http://github.com/daonb/django-extensions/tarball/master#egg=django-extensions

[poppler]
# NOTE: poppler is not in parts by default, since it takes 10 minutes to build and is required
# only for simple/management/parse_government_bill, and it can be installed more
# easily via apt-get/yum/etc.
# poppler
recipe=zc.recipe.cmmi
url=http://poppler.freedesktop.org/poppler-0.14.2.tar.gz

[activity_stream]
recipe=zerokspot.recipe.git
repository=git://github.com/daonb/django-activity-stream.git
Expand Down
@@ -0,0 +1 @@
from parse_goverment_bill_pdf import *
@@ -0,0 +1,96 @@
""" Code to look at PDFs and check the text selection mechanism
of poppler. Left here for future reference (not used by any management
command).
"""

import os
import itertools

import gtk
import goocanvas
import gobject

import poppler

import read_gov_law_proposal as gov
import pdftotext_ext as ext

# Module-level sample document, opened at import time. The file name 538.pdf is
# hard-coded and resolved against the current working directory.
# NOTE(review): presumably importing this module fails when that file is absent - confirm.
pdf=poppler.document_new_from_file('file://%s/538.pdf'%os.getcwd(),password=None)

def squares(width, height, n_wide, n_high):
    """Yield the cells of an n_wide x n_high grid laid over a width x height area.

    Every cell is an (x, y, w, h) tuple of floats.  Cells are produced row by
    row: all columns of the top row first, then the next row, and so on.
    """
    cell_w = float(width) / n_wide
    cell_h = float(height) / n_high
    for row in range(n_high):
        for col in range(n_wide):
            yield (cell_w * col, cell_h * row, cell_w, cell_h)

def enlarging_square_range(start, height, end_width, n):
    """Yield n + 1 rectangles anchored at start that widen from 0 to end_width.

    start:     (x, y) of the corner shared by every rectangle.
    height:    height given to every rectangle.
    end_width: width of the last rectangle.
    n:         number of equal widening steps; must be non-zero
               (n == 0 raises ZeroDivisionError, as before).

    Each item is an (x, y, w, h) tuple.  The width is always computed in
    floating point so that integer arguments are not truncated by Python 2's
    integer division (this matches what squares() does with float()).
    """
    left, top = start[0], start[1]
    full_width = float(end_width)
    for step in range(n + 1):
        yield (left, top, full_width * step / n, height)

def find_middle_at_y(page, start, height, the_end):
    """Bisect the width of a selection box to find where the selected text changes.

    The box is anchored at start = (x, y) and is `height` tall.  The length of
    the text selected with width 0 and with width `the_end` are taken as the
    two reference values; the width is then halved up to 10 times, each time
    keeping the half whose end points still differ.

    Returns (width, iteration): the last midpoint tried and the index of the
    last iteration run.  When both reference lengths are already equal there
    is nothing to bisect and (0, 0) is returned.  Returns (-1, -1), after
    printing a diagnostic, when a midpoint yields a text length that matches
    neither reference value.
    """
    x0, y0 = start[0], start[1]

    def text_len(width):
        # Length of the text selected inside a box of the given width.
        return len(gov.get_text(page, gov.rect(x0, y0, width, height)))

    narrow, wide = 0, the_end
    # The two reference lengths never change during the search: a bound is
    # only moved to a midpoint that produced the very same length.
    narrow_len, wide_len = text_len(narrow), text_len(wide)

    middle = narrow
    i = 0
    for i in range(10):
        if narrow_len == wide_len:
            break
        # Float midpoint: avoids integer truncation when the_end is an int.
        middle = (narrow + wide) / 2.0
        middle_len = text_len(middle)
        if middle_len == wide_len:
            wide = middle
        elif middle_len == narrow_len:
            narrow = middle
        else:
            print("not a normal stretch at iteration %s" % i)
            return (-1, -1)
    return middle, i

def find_column_separation(page, height=100, the_end=None):
    """Run find_middle_at_y at y = 0, 100, ..., 900 and return the list of results.

    The original call passed only (page, (0, y)) to find_middle_at_y, which
    also requires a box height and an end width, so it could never run.

    height:  height of each probe box.  NOTE(review): the default of 100 is
             chosen to match the 100-unit step between probes - confirm.
    the_end: widest box to probe; defaults to the page width as reported by
             page.get_size().

    Each element of the returned list is whatever find_middle_at_y returns
    for that y, i.e. a (width, iteration) pair or (-1, -1).
    """
    if the_end is None:
        the_end = page.get_size()[0]
    return [find_middle_at_y(page, (0, y), height, the_end)
            for y in range(0, 1000, 100)]

def map_the_desert(size, square_to_text, square_iter, text_offset_iter=None):
    """Draw every square on a new canvas, labelled with the length of its text.

    size:             (width, height) pair.  It is unpacked for validation
                      only; nothing in the drawing depends on it.
    square_to_text:   callable (x, y, w, h) -> text for that square.
    square_iter:      iterable of (x, y, w, h) squares.
    text_offset_iter: optional iterable of (dx, dy) offsets, one consumed per
                      square, applied to the drawn rectangle and its label.
                      Defaults to an endless stream of (0, 0).

    Returns the list of texts, in the order the squares were visited.
    """
    # Positional replacement for the old tuple parameter "(width, height)";
    # unpacking keeps rejecting anything that is not a pair.
    _width, _height = size
    _window, canvas = make_widget()
    if text_offset_iter is None:
        # Was a bare `repeat`, which is not imported; only `itertools` is.
        text_offset_iter = itertools.repeat((0, 0))
    offsets = iter(text_offset_iter)
    root = canvas.get_root_item()
    texts = []
    for x, y, w, h in square_iter:
        dx, dy = next(offsets)
        txt = square_to_text(x, y, w, h)
        texts.append(txt)
        root.add_child(goocanvas.Rect(x=x + dx, y=y + dy, width=w, height=h))
        # The label must be a string; the original passed the int len(txt).
        label = goocanvas.Text(text=str(len(txt)),
                               x=x + w / 2.0 + dx, y=y + h / 2.0 + dy)
        root.add_child(label)
    return texts

def cover1(page, N=10):
    """Cover the whole page with an N x N grid and map the text of every cell.

    The original referenced undefined `width`/`height` and passed the page
    where map_the_desert expects a size, with no text callback at all.  The
    page size now comes from page.get_size(), and the text of each cell is
    read with gov.get_text / gov.rect, as find_middle_at_y does.

    Returns the list of texts produced by map_the_desert.
    """
    width, height = page.get_size()

    def square_to_text(x, y, w, h):
        return gov.get_text(page, gov.rect(x, y, w, h))

    return map_the_desert((width, height),
                          square_to_text,
                          squares(width, height, N, N))

def stretch(use_ext, page_description, start, height, end_width, N=10):
    """Map the text selected by a box that widens from 0 to end_width in N steps.

    use_ext:          when true, text is extracted with ext.pdftotext and
                      page_description is a (filename, page_num) pair;
                      otherwise page_description is a poppler page and text is
                      read with gov.get_text / gov.rect.
    start, height,
    end_width, N:     passed to enlarging_square_range to build the boxes.

    Fixes over the original:
    * the `height` argument was overwritten by the page height before being
      used, so the caller's value was silently ignored;
    * `get_page` and `pypoppler_text_from_page` are not defined or imported in
      this module; the page is now opened through poppler directly and the
      text is read through the `gov` helpers this module already uses.

    Returns the list of texts produced by map_the_desert.  Alternate boxes
    are drawn shifted 10 units up and down via the offset cycle.
    """
    if use_ext:
        filename, page_num = page_description
        uri = 'file://%s' % os.path.realpath(filename)
        # Keep the document referenced while its page is in use.
        document = poppler.document_new_from_file(uri, password=None)
        page = document.get_page(page_num)

        def square_to_text(x, y, w, h):
            # ext.pdftotext is given page_num + 1 for both first and last page.
            return ext.pdftotext(filename=filename,
                                 first=page_num + 1, last=page_num + 1,
                                 x=x, y=y, w=w, h=h)
    else:
        page = page_description

        def square_to_text(x, y, w, h):
            return gov.get_text(page, gov.rect(x, y, w, h))

    # Distinct names so the caller's `height` reaches enlarging_square_range.
    page_width, page_height = page.get_size()
    return map_the_desert((page_width, page_height),
                          square_to_text,
                          enlarging_square_range(start, height, end_width, N),
                          itertools.cycle([(0, -10), (0, 10)]))

def make_widget():
    """Create and show a gtk window that contains a single goocanvas canvas.

    Returns the (window, canvas) pair.
    """
    window = gtk.Window()
    canvas = goocanvas.Canvas()
    window.add(canvas)
    window.show_all()
    return window, canvas

@@ -0,0 +1,48 @@
"""
We don't require poppler at this time - pdftotext and pdfinfo both require
fewer dependencies and provide the same or better functionality.
But we might later on (when poppler gets to be a better python wrapper,
since it is using the same library that pdftotext and pdfinfo use, and
will have less overhead). So this code is left here for possible future use.
Alon
"""

import os
import poppler
from textutil import reverse_numbers

def text_from_page(page, x, y, w, h):
    """Return the text selected inside the box (x, y, w, h) on page.

    The box is built with this module's rect() and read with its get_text().
    The original reached both through a `gov` name that this module never
    imports, so every call raised NameError.
    """
    return get_text(page, rect(x, y, w, h))

# Documents opened so far, keyed by the file name exactly as it was passed in.
pdf_cache = {}


def get_pdf(filename):
    """Return the poppler document for filename, opening it on first use.

    The document is opened from a file:// URI built from the real path of
    filename and is remembered in pdf_cache for later calls.
    """
    try:
        return pdf_cache[filename]
    except KeyError:
        pass
    uri = 'file://%s' % os.path.realpath(filename)
    document = poppler.document_new_from_file(uri, password=None)
    pdf_cache[filename] = document
    return document

def get_page(filename, page_num):
    """Return page page_num of the document that get_pdf() gives for filename."""
    document = get_pdf(filename)
    return document.get_page(page_num)

def get_text(page, rect, style=1):
    """Return the text selected by rect on page, passed through reverse_numbers.

    page.get_text() is called with the given style and rect; its result is
    converted with unicode() before reverse_numbers() is applied.
    """
    selected = page.get_text(style=style, rect=rect)
    as_unicode = unicode(selected)
    return reverse_numbers(as_unicode)

def rect(x, y, w, h):
    """Build a poppler.Rectangle from a corner (x, y), a width and a height.

    The rectangle spans x1 = x to x2 = x + w and y1 = y to y2 = y + h.
    """
    rectangle = poppler.Rectangle()
    rectangle.x1 = x
    rectangle.x2 = x + w
    rectangle.y1 = y
    rectangle.y2 = y + h
    return rectangle

def get_whole_page_text(page):
    """Return get_text() for a rectangle from (0.0, 0.0) to page.get_size()."""
    whole_page = poppler.Rectangle()
    whole_page.x1 = 0.0
    whole_page.y1 = 0.0
    whole_page.x2, whole_page.y2 = page.get_size()
    return get_text(page, rect=whole_page)

def render_page_to_png(page, filename, width=768, height=1024):
    """Render the selection covering the whole page into a PNG file.

    page:     poppler page to render.
    filename: path of the PNG file to write.
    width,
    height:   pixel size of the output image.  The render scale is derived
              from width only (width divided by the page width).

    Fixes over the original:
    * `gtk` was used without being imported anywhere in this module;
    * `selection=rect` passed the module-level function rect() instead of a
      rectangle; a rectangle spanning the whole page is now built, in the
      same way get_whole_page_text() does.
    """
    # Local import: gtk is needed by this function only and is absent from
    # the module's top-level imports.
    import gtk

    page_width, page_height = page.get_size()
    selection = poppler.Rectangle()
    selection.x1, selection.y1 = 0.0, 0.0
    selection.x2, selection.y2 = page_width, page_height

    pixbuf = gtk.gdk.Pixbuf(gtk.gdk.COLORSPACE_RGB, True, 8, width, height)
    page.render_selection_to_pixbuf(
        float(width) / page_width,
        rotation=0,
        pixbuf=pixbuf,
        selection=selection,
        old_selection=poppler.Rectangle(),
        style=1,
        glyph_color=gtk.gdk.Color('#ffffff'),
        background_color=gtk.gdk.Color('#000000'))
    pixbuf.save(filename, 'png')


0 comments on commit 1796233

Please sign in to comment.