Skip to content

Commit

Permalink
Initial commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Furley committed Jun 26, 2013
0 parents commit f9e7e3e
Show file tree
Hide file tree
Showing 15 changed files with 1,314 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
*.pyc
28 changes: 28 additions & 0 deletions README.md
@@ -0,0 +1,28 @@
# pdftables - a library for extracting tables from PDF files

pdftables uses [pdfminer][1] to get information on the locations of text elements in a PDF document.

First we get a file handle to a PDF:
```python
filepath = os.path.join(PDF_TEST_FILES,SelectedPDF)
fh = open(filepath,'rb')
```
Then we use our `getPDFPage` function to select a single page from the document:
```python
pdfPage = getPDFPage(fh, pagenumber)
table,diagnosticData = pageToTables(pdfPage, extend_y = False, hints = hints, atomise = False)
```
Setting the optional `extend_y` parameter to `True` extends the grid used to extract the table to the full height of the page.
The optional `hints` parameter is a two-element array of strings: the first element should contain unique text at the top of the table,
and the second element should contain unique text from the bottom row of the table.
Setting the optional `atomise` parameter to `True` converts all the text to individual characters; this is slower, but it will sometimes
split closely separated columns.

`table` is a list of lists of strings. `diagnosticData` is an object containing diagnostic information which can be displayed using
the `plotpage` function:

```python
fig,ax1 = plotpage(diagnosticData)
```

[1]: http://www.unixuser.org/~euske/python/pdfminer/
Empty file added fixtures/.keep
Empty file.
104 changes: 104 additions & 0 deletions pdftables/TableFinder.py
@@ -0,0 +1,104 @@
#!/usr/bin/env python
# ScraperWiki Limited
# Ian Hopkinson, 2013-06-14
# -*- coding: utf-8 -*-

"""
Code to find tables in PDF files
"""

import os
# import requests
import scraperwiki # pdftoxml does not work on Windows
import lxml.html
import glob
import matplotlib.pyplot as plt
import collections


# TODO - Use pdfminer
# TODO

def pdftoxml(filename,options):
    """
    Convert a PDF to XML with the external pdftohtml tool and return the XML.

    The converter is run through the shell and writes "temph.xml" into the
    same directory as ``filename``; that file is then read back.

    filename -- path of the PDF file to convert
    options  -- string of extra command-line switches placed after "-xml"

    Returns the raw content of the generated XML file, read in binary mode.
    Raises IOError/OSError if the converter did not produce the file.
    """
    # NOTE(review): machine-specific path to the converter binary; it has to
    # be edited before this runs anywhere else.
    ConverterPath = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\bin\pdftohtml.exe')
    directory = os.path.split(filename)[0]
    tmpxml = os.path.join(directory,"temph.xml")

    # Remove output left over from a previous run so a failed conversion
    # cannot silently return old data.  The path is tested directly: looking
    # it up in os.listdir('.') only ever matched when the PDF lived in the
    # current working directory.
    if os.path.exists(tmpxml):
        os.remove(tmpxml)

    # The converter is given the output name without its extension.  Both
    # paths are quoted so directories containing spaces survive the shell.
    cmd = '%s -xml %s "%s" "%s"' % (ConverterPath, options, filename, os.path.splitext(tmpxml)[0])

    os.system(cmd)

    # The context manager closes the handle even if the read fails.
    with open(tmpxml,'rb') as f:
        content = f.read()

    return content

def processpage(page):
    """
    Collect the positions of the text chunks on one page element.

    page -- an element carrying "height" and "width" attributes whose
            "text" children carry "left", "top" and "width" attributes
            (all integer strings), or None.

    Returns a tuple (pageheight, pagewidth, left, top, right) where left,
    top and right are parallel lists with one entry per text chunk:
    the chunk's left edge, its top measured up from the bottom of the page
    (pageheight - top attribute), and its right edge (left + width).
    A page of None yields (0, 0, [], [], []).
    """
    if page is None:
        # Previously this path tried to iterate over False and raised
        # TypeError; treat a missing page as a page without text.
        return 0, 0, [], [], []

    # Read the dimensions from the page itself rather than relying on
    # module-level globals having been set by the caller beforehand.
    pageheight = int(page.attrib.get('height'))
    pagewidth = int(page.attrib.get('width'))

    left = []
    top = []
    right = []
    # findall('text') selects the direct "text" children, the same nodes the
    # relative XPath 'text' selected, and also works on stdlib ElementTree.
    for textchunk in page.findall('text'):
        thisleft = int(textchunk.attrib.get('left'))
        thiswidth = int(textchunk.attrib.get('width'))
        left.append(thisleft)
        # Flip the vertical axis so that y grows upwards when plotted.
        top.append(pageheight - int(textchunk.attrib.get('top')))
        right.append(thisleft + thiswidth)

    return pageheight,pagewidth,left,top,right

def plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right):
    """
    Show a diagnostic plot of one page and return the matplotlib figure.

    The page outline is drawn as a closed rectangle from (0, 0) to
    (pagewidth, pageheight).  Every text chunk contributes two markers at
    its ``top`` value: a blue square at its left edge and a red circle at
    its right edge.  The figure title combines ``SelectedPDF`` and
    ``pagenumber``.  ``plt.show()`` is called before the figure is returned.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.axis('equal')

    # Page border, traced corner by corner and closed back at the origin.
    outline_x = [0, pagewidth, pagewidth, 0, 0]
    outline_y = [0, 0, pageheight, pageheight, 0]
    axes.plot(outline_x, outline_y)

    # Left edges first, then right edges, each with its own colour and marker.
    for edge_positions, colour, shape in ((left, 'b', "s"), (right, 'r', "o")):
        axes.scatter(edge_positions, top, s=10, c=colour, marker=shape)

    title = '%s : Page %d' % (SelectedPDF,pagenumber)
    figure.suptitle(title, fontsize=15)
    plt.show()
    return figure

# Directory holding the sample PDFs.
# NOTE(review): machine-specific path; adjust before running elsewhere.
PDF_TEST_FILES = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\fixtures')

# PDFList = glob.glob(os.path.join(PDF_TEST_FILES,'*.pdf'))

# SelectedPDF = 6 # 6 = cit0613.pdf - table is actually an image

# r = requests.get(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]))
# options = ""
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]),options)

# Notes on how each sample file behaves when converted:
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"cit0613.pdf"),options) # Works but first page is an image
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"2012.01.PosRpt.pdf"),options) # PDF to HTML does not like
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAWEEKLYJUNE52013.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAMONTHLYMay2013.pdf"),options) # lxml doesn't like this one, interleaved <b> and <i> tags
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"13_06_12_10_36_58_boletim_ingles_junho_2013.pdf"),options) # Long document with many tables
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"1359397366Final_Coceral grain estimate_2012_December.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"ClinicalResearchDisclosureReport2012Q2.pdf"),options) # throws not allowed
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"argentina_diputados_voting_record.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"bo_page24.pdf"),options) # Multi-column text and tables mixed on the page
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"tabla_subsidios.pdf"),options) # Multi-column text and tables mixed on the page
SelectedPDF = "argentina_diputados_voting_record.pdf"

# Extra switches for the converter.  This must be a live assignment: it was
# previously only present inside a comment, so the call below raised
# NameError.
options = ""

xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,SelectedPDF),options)

# Parse the converter output; each child of the root element is one page.
root = lxml.etree.fromstring(xmldata)
pages = list(root)

# This is ok but


# Plot every page in turn.
for page in pages:
    attributes = page.attrib
    # These module-level names are also read by processpage(), so they are
    # assigned before it is called.
    pagenumber = int(attributes.get("number"))
    pagewidth = int(attributes.get("width"))
    pageheight = int(attributes.get("height"))

    pageheight,pagewidth,left,top,right = processpage(page)

    fig = plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right)


# counter=collections.Counter(left)
1 change: 1 addition & 0 deletions pdftables/__init__.py
@@ -0,0 +1 @@
from pdftables import *
39 changes: 39 additions & 0 deletions pdftables/display.py
@@ -0,0 +1,39 @@
#!/usr/bin/env python
from __future__ import unicode_literals
from collections import defaultdict


def to_string(table):
    """
    Render a table (a list of rows, each a list of strings) as a text grid.

    Each row is prefixed with its right-aligned index, every cell is
    right-justified to the widest cell of its column, and a horizontal bar
    of dashes is placed above and below the rows.  The returned string ends
    with a newline.  Rows with fewer cells than the widest row are rendered
    with only the cells they have.

    >>> print(to_string([['foo', 'goodbye'], ['llama', 'bar']]))
     ----------------
      0: |   foo|goodbye|
      1: | llama|    bar|
     ----------------
    <BLANKLINE>
    """
    col_widths = find_column_widths(table)
    # Width of a row after its index prefix: "| " (2) + all cells +
    # one "|" between neighbouring cells (n - 1) + the closing "|" (1).
    table_width = sum(col_widths) + len(col_widths) + 2
    hbar = ' {}\n'.format('-' * table_width)

    # Collect the lines and join once instead of growing a string with +=.
    lines = [hbar]
    for row_index, row in enumerate(table):
        cells = [cell.rjust(width, ' ') for (cell, width)
                 in zip(row, col_widths)]
        lines.append("{:>3}: | {}|\n".format(row_index, '|'.join(cells)))
    lines.append(hbar)

    return ''.join(lines)


def find_column_widths(table):
    """
    Return, for every column index, the length of its longest cell.

    Rows may have different lengths; the result has one entry per column of
    the longest row.  An empty table gives an empty list.

    >>> find_column_widths([['foo', 'goodbye'], ['llama', 'bar']])
    [5, 7]
    """
    widest = []
    for cells in table:
        for position, cell in enumerate(cells):
            size = len(cell)
            if position == len(widest):
                # First time this column is seen in any row.
                widest.append(size)
            elif size > widest[position]:
                widest[position] = size
    return widest

if __name__ == '__main__':
    # Small demonstration: render a two-by-two sample table.
    sample_table = [['foo', 'goodbye'], ['llama', 'bar']]
    print(to_string(sample_table))

0 comments on commit f9e7e3e

Please sign in to comment.