forked from drj11/pdftables
-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Paul Furley
committed
Jun 26, 2013
0 parents
commit f9e7e3e
Showing
15 changed files
with
1,314 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
*.pyc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
# pdftables - a library for extracting tables from PDF files | ||
|
||
pdftables uses [pdfminer][1] to get information on the locations of text elements in a PDF document. | ||
|
||
First we get a file handle to a PDF: | ||
```python | ||
filepath = os.path.join(PDF_TEST_FILES,SelectedPDF) | ||
fh = open(filepath,'rb') | ||
``` | ||
Then we use our `getPDFPage` function to select a single page from the document, and `pageToTables` to extract the table from that page: | ||
```python | ||
pdfPage = getPDFPage(fh, pagenumber) | ||
table,diagnosticData = pageToTables(pdfPage, extend_y = False, hints = hints, atomise = False) | ||
``` | ||
Setting the optional `extend_y` parameter to `True` extends the grid used to extract the table to the full height of the page. | ||
The optional `hints` parameter is a two-element string array: the first element should contain unique text at the top of the table, | ||
the second element should contain unique text from the bottom row of the table. | ||
Setting the optional `atomise` parameter to `True` converts all the text to individual characters; this is slower, but will sometimes | ||
split closely separated columns that would otherwise be merged. | ||
|
||
`table` is a list of lists of strings. `diagnosticData` is an object containing diagnostic information which can be displayed using | ||
the `plotpage` function: | ||
|
||
```python | ||
fig,ax1 = plotpage(diagnosticData) | ||
``` | ||
|
||
[1]: http://www.unixuser.org/~euske/python/pdfminer/ |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
#!/usr/bin/env python | ||
# ScraperWiki Limited | ||
# Ian Hopkinson, 2013-06-14 | ||
# -*- coding: utf-8 -*- | ||
|
||
""" | ||
Code to find tables in PDF files | ||
""" | ||
|
||
import os | ||
# import requests | ||
import scraperwiki # pdftoxml does not work on Windows | ||
import lxml.html | ||
import glob | ||
import matplotlib.pyplot as plt | ||
import collections | ||
|
||
|
||
# TODO - Use pdfminer | ||
# TODO | ||
|
||
def pdftoxml(filename, options):
    """Convert a PDF to XML with the external pdftohtml tool and return it.

    The converter is launched through ``os.system`` with the ``-xml`` flag.
    Its output is read back from ``temph.xml`` in the same directory as
    ``filename``.

    Args:
        filename: path of the PDF file to convert.
        options: extra command-line options, inserted verbatim into the
            converter command line (pass "" for none).

    Returns:
        The raw content of the generated XML file, read in binary mode.

    Raises:
        IOError: if the output file cannot be opened, e.g. because the
            converter failed (its exit status is not checked).
    """
    # NOTE: hard-coded, machine-specific location of the converter binary.
    ConverterPath = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\bin\pdftohtml.exe')
    directory = os.path.split(filename)[0]
    tmpxml = os.path.join(directory, "temph.xml")

    # Remove stale output from a previous run. Test the path itself rather
    # than looking it up in os.listdir('.'): tmpxml carries the PDF's
    # directory, so it never matches a bare name from the current directory
    # unless the PDF happens to live there.
    if os.path.exists(tmpxml):
        os.remove(tmpxml)

    # The output name is passed without its extension; presumably the
    # converter appends ".xml" itself -- confirm against pdftohtml's docs.
    cmd = '%s -xml %s "%s" %s' % (ConverterPath, options, filename, os.path.splitext(tmpxml)[0])

    os.system(cmd)

    # The context manager closes the handle even if read() raises.
    with open(tmpxml, 'rb') as f:
        content = f.read()

    return content
|
||
def processpage(page):
    """Collect the edge coordinates of every text chunk on a page.

    Args:
        page: a page element exposing ``attrib`` (with ``height`` and
            ``width`` entries) and ``xpath('text')`` yielding text-chunk
            elements whose ``attrib`` has ``left``, ``width`` and ``top``.
            ``None`` is treated as an empty page.

    Returns:
        A tuple ``(pageheight, pagewidth, left, top, right)`` where ``left``,
        ``top`` and ``right`` are parallel lists with one entry per text
        chunk. ``top`` is stored as ``pageheight - top`` so the vertical axis
        is measured from the bottom of the page. For a ``None`` page the
        result is ``(0, 0, [], [], [])``.
    """
    if page is None:
        # Nothing to measure; avoid iterating over a non-iterable.
        return 0, 0, [], [], []

    # Take the page size from the page itself instead of depending on
    # module-level variables that the caller must have set beforehand.
    pageheight = int(page.attrib.get('height'))
    pagewidth = int(page.attrib.get('width'))

    left = []
    top = []
    right = []
    for textchunk in page.xpath('text'):
        thisleft = int(textchunk.attrib.get('left'))
        thiswidth = int(textchunk.attrib.get('width'))
        left.append(thisleft)
        # Flip the vertical coordinate so it counts up from the page bottom.
        top.append(pageheight - int(textchunk.attrib.get('top')))
        right.append(thisleft + thiswidth)

    return pageheight, pagewidth, left, top, right
|
||
def plotpage(pageheight, pagewidth, pagenumber, SelectedPDF, left, top, right):
    """Show a diagnostic plot of the text-chunk edges found on one page.

    The page outline is drawn as a closed rectangle from the origin to
    ``(pagewidth, pageheight)``. Left edges are scattered as blue squares
    and right edges as red circles, both against ``top``. The figure is
    titled with the PDF name and page number and displayed with
    ``plt.show()``.

    Returns:
        The matplotlib figure that was created.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.axis('equal')

    # Page outline, traced corner by corner back to the starting point.
    outline_x = [0, pagewidth, pagewidth, 0, 0]
    outline_y = [0, 0, pageheight, pageheight, 0]
    axes.plot(outline_x, outline_y)

    # Left edges first, then right edges, each with its own colour/marker.
    for edges, colour, shape in ((left, 'b', "s"), (right, 'r', "o")):
        axes.scatter(edges, top, s=10, c=colour, marker=shape)

    title = '%s : Page %d' % (SelectedPDF, pagenumber)
    figure.suptitle(title, fontsize=15)
    plt.show()
    return figure
|
||
# Driver script: convert one fixture PDF to XML and plot every page.

# NOTE: hard-coded, machine-specific directory holding the test PDFs.
PDF_TEST_FILES = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\fixtures')

# PDFList = glob.glob(os.path.join(PDF_TEST_FILES,'*.pdf'))

# SelectedPDF = 6 # 6 = cit0613.pdf - table is actually an image

# r = requests.get(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]))
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]),options)

# Notes on how each fixture behaves:
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"cit0613.pdf"),options) # Works but first page is an image
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"2012.01.PosRpt.pdf"),options) # PDF to HTML does not like
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAWEEKLYJUNE52013.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAMONTHLYMay2013.pdf"),options) # lxml doesn't like this one, interleaved <b> and <i> tags
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"13_06_12_10_36_58_boletim_ingles_junho_2013.pdf"),options) # Long document with many tables
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"1359397366Final_Coceral grain estimate_2012_December.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"ClinicalResearchDisclosureReport2012Q2.pdf"),options) # throws not allowed
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"argentina_diputados_voting_record.pdf"),options)
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"bo_page24.pdf"),options) # Multi-column text and tables mixed on the page
# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"tabla_subsidios.pdf"),options) # Multi-column text and tables mixed on the page

# Extra converter options. This assignment used to exist only as a
# commented-out line, so the pdftoxml call below failed with NameError.
options = ""

SelectedPDF = "argentina_diputados_voting_record.pdf"

xmldata = pdftoxml(os.path.join(PDF_TEST_FILES, SelectedPDF), options)

root = lxml.etree.fromstring(xmldata)
pages = list(root)

# This is ok but

for page in pages:
    # Page metadata taken from the page element's attributes.
    pagenumber = int(page.attrib.get("number"))
    pagewidth = int(page.attrib.get("width"))
    pageheight = int(page.attrib.get("height"))

    pageheight, pagewidth, left, top, right = processpage(page)

    fig = plotpage(pageheight, pagewidth, pagenumber, SelectedPDF, left, top, right)

# counter=collections.Counter(left)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from pdftables import * |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python | ||
from __future__ import unicode_literals | ||
from collections import defaultdict | ||
|
||
|
||
def to_string(table):
    """
    Returns the table rendered as a text grid, one numbered line per row

    Every cell is right-justified to the maximum width of its column, cells
    are separated by '|', and a bar of dashes is written before the first
    row and after the last one. `table` is a list of rows, each a list of
    strings.

    >>> print(to_string([['foo', 'goodbye'], ['llama', 'bar']]))
     ----------------
      0: |   foo|goodbye|
      1: | llama|    bar|
     ----------------
    <BLANKLINE>
    """
    col_widths = find_column_widths(table)
    table_width = sum(col_widths) + len(col_widths) + 2
    hbar = ' {}\n'.format('-' * table_width)

    # Collect the pieces and join once instead of growing a string with +=.
    lines = [hbar]
    for row_index, row in enumerate(table):
        cells = [cell.rjust(width, ' ') for (cell, width)
                 in zip(row, col_widths)]
        lines.append("{:>3}: | {}|\n".format(row_index, '|'.join(cells)))
    lines.append(hbar)

    return ''.join(lines)
|
||
|
||
def find_column_widths(table):
    """
    Returns, for each column position, the length of its longest cell

    Rows may have different lengths; the result has one entry per column
    position that occurs in any row.
    >>> find_column_widths([['foo', 'goodbye'], ['llama', 'bar']])
    [5, 7]
    """
    widest = []
    for row in table:
        for position, cell in enumerate(row):
            cell_len = len(cell)
            if position == len(widest):
                # First time this column position is seen.
                widest.append(cell_len)
            elif cell_len > widest[position]:
                widest[position] = cell_len
    return widest
|
||
if __name__ == '__main__':
    # Demo when run as a script: render a small two-row table and print it.
    sample_table = [['foo', 'goodbye'], ['llama', 'bar']]
    rendered = to_string(sample_table)
    print(rendered)
Oops, something went wrong.