Initial commit.

okfn · Jun 26, 2013 · f9e7e3e · f9e7e3e
commit f9e7e3e
Show file tree

Hide file tree

Showing 15 changed files with 1,314 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/README.md b/README.md
@@ -0,0 +1,28 @@
+# pdftables - a library for extracting tables from PDF files
+
+pdftables uses [pdfminer][1] to get information on the locations of text elements in a PDF document.
+
+First we get a file handle to a PDF:
+```python
+filepath = os.path.join(PDF_TEST_FILES,SelectedPDF)
+fh = open(filepath,'rb')
+```
+Then we use our `getPDFPage` function to select a single page from the document:
+```python
+pdfPage = getPDFPage(fh, pagenumber)    
+table,diagnosticData = pageToTables(pdfPage, extend_y = False, hints = hints, atomise = False)
+```
+Setting the optional `extend_y` parameter to `True` extends the grid used to extract the table to the full height of the page.
+The optional `hints` parameter is a two-element string array: the first element should contain unique text at the top of the table,
+and the second element should contain unique text from the bottom row of the table.
+Setting the optional `atomise` parameter to `True` converts all the text to individual characters; this is slower but will sometimes
+split closely separated columns.
+
+`table` is a list of lists of strings. `diagnosticData` is an object containing diagnostic information which can be displayed using
+the `plotpage` function:
+
+```python
+fig,ax1 = plotpage(diagnosticData)
+```
+
+[1]: http://www.unixuser.org/~euske/python/pdfminer/
diff --git a/fixtures/.keep b/fixtures/.keep
diff --git a/pdftables/TableFinder.py b/pdftables/TableFinder.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python
+# ScraperWiki Limited
+# Ian Hopkinson, 2013-06-14
+# -*- coding: utf-8 -*-
+
+"""
+Code to find tables in PDF files
+"""
+
+import os
+# import requests
+import scraperwiki # pdftoxml does not work on Windows
+import lxml.html
+import glob
+import matplotlib.pyplot as plt
+import collections
+
+
+# TODO - Use pdfminer
+# TODO 
+
+def pdftoxml(filename,options):
+    ConverterPath = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\bin\pdftohtml.exe')
+    directory = os.path.split(filename)[0]
+    tmpxml = os.path.join(directory,"temph.xml")
+    if tmpxml in os.listdir('.'):
+        os.remove(tmpxml)
+    cmd = '%s -xml %s "%s" %s' % (ConverterPath, options, filename, os.path.splitext(tmpxml)[0])
+
+    os.system(cmd)
+
+    f = open(tmpxml,'rb')
+    content = f.read()
+    f.close()
+
+    return content 
+
+def processpage(page):
+    left=[]
+    width=[]
+    top=[]
+    right=[]
+    for textchunk in (page is not None and page.xpath('text')):
+        thisleft = int(textchunk.attrib.get('left'))
+        thiswidth = int(textchunk.attrib.get('width'))
+        left.append(thisleft)
+        width.append(thiswidth)
+        top.append(pageheight - int(textchunk.attrib.get('top')))
+        right.append(thisleft + thiswidth)
+
+    return pageheight,pagewidth,left,top,right
+
+def plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right):
+    fig = plt.figure()       
+    ax1 = fig.add_subplot(111)
+    ax1.axis('equal')    
+    ax1.plot([0,pagewidth,pagewidth,0,0],[0,0,pageheight,pageheight,0])
+    ax1.scatter(left, top, s=10, c='b', marker="s")
+    ax1.scatter(right, top, s=10, c='r', marker="o")
+    fig.suptitle('%s : Page %d' % (SelectedPDF,pagenumber), fontsize=15)
+    plt.show()    
+    return fig
+
+PDF_TEST_FILES = unicode(r'C:\Users\Ian\BitBucketRepos\0939-AgraInforma\fixtures')
+
+# PDFList = glob.glob(os.path.join(PDF_TEST_FILES,'*.pdf'))
+
+# SelectedPDF = 6 # 6 = cit0613.pdf - table is actually an image
+
+# r = requests.get(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]))
+# options = ""
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,PDFList[SelectedPDF]),options)
+
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"cit0613.pdf"),options) # Works but first page is an image
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"2012.01.PosRpt.pdf"),options) # PDF to HTML does not like
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAWEEKLYJUNE52013.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"COPAMONTHLYMay2013.pdf"),options) # lxml doesn't like this one, interleaved <b> and <i> tags
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"13_06_12_10_36_58_boletim_ingles_junho_2013.pdf"),options) # Long document with many tables
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"1359397366Final_Coceral grain estimate_2012_December.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"ClinicalResearchDisclosureReport2012Q2.pdf"),options) # throws not allowed
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"argentina_diputados_voting_record.pdf"),options)
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"bo_page24.pdf"),options) # Multi-column text and tables mixed on the page
+# xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,"tabla_subsidios.pdf"),options) # Multi-column text and tables mixed on the page
+SelectedPDF = "argentina_diputados_voting_record.pdf"
+
+xmldata = pdftoxml(os.path.join(PDF_TEST_FILES,SelectedPDF),options)
+
+root = lxml.etree.fromstring(xmldata)
+pages = list(root)
+
+# This is ok but 
+
+
+for page in pages:
+    pagenumber = int(page.attrib.get("number"))
+    pagewidth = int(page.attrib.get("width"))
+    pageheight = int(page.attrib.get("height"))
+
+    pageheight,pagewidth,left,top,right = processpage(page)
+
+    fig = plotpage(pageheight,pagewidth,pagenumber,SelectedPDF,left,top,right)
+
+
+    # counter=collections.Counter(left)
diff --git a/pdftables/__init__.py b/pdftables/__init__.py
@@ -0,0 +1 @@
+from pdftables import *
diff --git a/pdftables/display.py b/pdftables/display.py
@@ -0,0 +1,39 @@
+#!/usr/bin/env python
+from __future__ import unicode_literals
+from collections import defaultdict
+
+
+def to_string(table):
+    """
+    Returns a list of the maximum width for each column across all rows
+    >>> to_string([['foo', 'goodbye'], ['llama', 'bar']])
+    None
+    """
+    result = ''
+    col_widths = find_column_widths(table)
+    table_width = sum(col_widths) + len(col_widths) + 2
+    hbar = '     {}\n'.format('-' * table_width)
+    result += hbar
+    for row_index, row in enumerate(table):
+        cells = [cell.rjust(width, ' ') for (cell, width)
+                 in zip(row, col_widths)]
+        result += "{:>3}: | {}|\n".format(row_index, '|'.join(cells))
+    result += hbar
+
+    return result
+
+
+def find_column_widths(table):
+    """
+    Returns a list of the maximum width for each column across all rows
+    >>> find_column_widths([['foo', 'goodbye'], ['llama', 'bar']])
+    [5, 7]
+    """
+    col_widths = defaultdict(lambda: 0)
+    for row_index, row in enumerate(table):
+        for column_index, cell in enumerate(row):
+            col_widths[column_index] = max(col_widths[column_index], len(cell))
+    return [col_widths[col] for col in sorted(col_widths)]
+
+if __name__ == '__main__':
+    print(to_string([['foo', 'goodbye'], ['llama', 'bar']]))