In [1]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTText
from pdfminer.converter import PDFPageAggregator

In [2]:
# Path of the PDF to parse (relative, so it is resolved against the current working directory).
fname = 'wc2015_Ladies_SP_Scores.pdf'

In [3]:
# Layout-analysis settings for pdfminer's LAParams.
# Only `word_margin` is handed to LAParams below; the other values belong to
# an alternative configuration that is currently disabled.
word_margin = 0.05
char_margin = 0.01
line_margin = 0.2
line_overlap = 0.8
detect_vertical = True

laparams = LAParams(word_margin=word_margin)

# Disabled alternative:
# laparams = LAParams(line_margin=line_margin, line_overlap=line_overlap,
#                     detect_vertical=detect_vertical, char_margin=char_margin)

In [4]:
# Run every page of `fname` through pdfminer and collect one aggregated
# layout result per page.
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
layout = []  # one entry per processed page, in iteration order
# The `with` block releases the file handle even if parsing or page
# processing raises; previously an exception left the file open.
with open(fname, 'rb') as fp:
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout.append(device.get_result())

In [5]:
# Number of per-page results collected in `layout`.
len(layout)

1

In [6]:
# Keep only the first page's result from here on.
# NOTE(review): this rebinds `layout` from the per-page list to a single element, so
# re-running this cell indexes into that element instead of the list.
layout = layout[0]

In [7]:
# Gather the text of every LTText object in `layout`; objects of any other
# type are skipped.
layout_text = [
    item.get_text()
    for item in list(layout)
    if isinstance(item, LTText)
]

In [8]:
# Display the extracted text chunks.
layout_text

[u'ISU World Figure Skating Championships 2015\nLADIES SHORT PROGRAM         JUDGES DETAILS PER SKATER\n',
 u'Rank  Name\n',
 u' 1\n',
 u'Elizaveta TUKTAMYSHEVA\n',
 u'#  Executed \nElements \n',
 u'3A\n3Lz\n',
 u'1\n2\n3 FSSp3\nLSp4\n4\n5\n3T+3T\n6 StSq4\n7\n',
 u'CCoSp3p4\n',
 u'o\nf\nn\n',
 u'I\n',
 u'Base \nValue \n',
 u' 8.50\n 6.00\n 2.60\n 2.70\n 9.02\n 3.90\n 3.50\n 36.22\n',
 u'x\n',
 u'Program Components \nSkating Skills\nTransition / Linking Footwork\nPerformance / Execution\nChoreography / Composition\nInterpretation\nJudges Total Program Component Score (factored)\n',
 u'Deductions: \n',
 u'  x  Credit for highlight distribution, base value multiplied by 1.1 \n',
 u'Rank  Name\n',
 u' 2\n',
 u'Elena RADIONOVA\n',
 u'#  Executed \nElements \n',
 u'3Lz+3T\n1\n2 FSSp4\n3 StSq4\n4\n5\n6 CCoSp3p4\n7\n',
 u'3Lo\n2A\n',
 u'LSp4\n',
 u'o\nf\nn\n',
 u'I\n',
 u'Base \nValue \n',
 u' 10.10\n 3.00\n 3.90\n 5.61\n 3.63\n 3.50\n 2.70\n 32.44\n',
 u'x\nx\n',
 u'Program Components \nSkati

In [30]:
import pyPdf

def getPDFContent(path, num_pages=1):
    """Extract whitespace-normalised text from the leading pages of a PDF.

    Args:
        path: Filesystem path of the PDF to read.
        num_pages: How many pages to read, starting at page index 0.
            Defaults to 1, the value that used to be hard-coded.

    Returns:
        The text of the requested pages as a single string, with
        non-breaking spaces replaced by regular spaces and every run of
        whitespace collapsed to one space.
    """
    # `open` replaces the Python-2-only `file` builtin, and the `with` block
    # closes the handle even when pyPdf raises part-way through.
    with open(path, "rb") as p:
        pdf = pyPdf.PdfFileReader(p)
        content = ""
        for i in range(0, num_pages):
            content += pdf.getPage(i).extractText() + "\n"
    return " ".join(content.replace(u"\xa0", " ").strip().split())

In [9]:
from pdfminer.converter import TextConverter
from cStringIO import StringIO
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage

class LineConverter(TextConverter):
    """TextConverter variant that rebuilds text lines from raw character positions.

    `end_page` groups every LTChar of the current page by a truncated
    y coordinate and writes one output line per group, with the characters
    of a group ordered by their x coordinate.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to TextConverter."""
        TextConverter.__init__(self, *args, **kwargs)

    def end_page(self, i):
        """Write the current page to `self.outfp`, one line per character row.

        Args:
            i: Page argument supplied by the caller; it is not used here.
        """
        # Maps row key -> {x coordinate: encoded character}.
        lines = {}
        for child in self.cur_item._objs:
            if isinstance(child, LTChar):
                # Only the last two bbox components are used
                # (presumably x1/y1 of pdfminer's (x0, y0, x1, y1) box -- confirm).
                (_, _, x, y) = child.bbox
                # Negating y makes the ascending sort below visit larger y
                # values first; int() truncation merges characters whose y
                # differs only in the fractional part into one row.
                # A later character with the same x in the same row replaces
                # the earlier one.
                lines.setdefault(int(-y), {})[x] = child._text.encode(self.codec)
        for y in sorted(lines.keys()):
            # Build each row once (the previous code built it twice and
            # discarded the first result).
            self.outfp.write(self.line_creator(lines[y]))
            self.outfp.write("\n")

    def line_creator(self, line):
        """Join the characters of one row in ascending x order.

        Args:
            line: Mapping of x coordinate to character text for a single row.

        Returns:
            The row's characters concatenated in ascending key order; an
            empty mapping yields an empty string.
        """
        return ''.join(line[x] for x in sorted(line.keys()))

In [86]:
# NOTE(review): this rebinds `fname`, which an earlier cell set to a different PDF, so cells
# that read `fname` depend on which assignment was executed last.
fname = 'gpcan2017_Ladies_FS_Scores.pdf'

In [10]:
# Convert every page of `fname` to text lines with LineConverter and keep the
# result in `out`.
rsrc = PDFResourceManager()
outfp = StringIO()
# NOTE(review): LineConverter.end_page does not call the parent implementation,
# so these LAParams may have no effect on the output -- confirm.
device = LineConverter(rsrc, outfp, codec="utf-8", laparams=LAParams(char_margin=10.0,line_overlap=0.01)) # default line_overlap 0.5
interpreter = PDFPageInterpreter(rsrc, device)
# The nested try/finally plus `with` release the file, the device and the
# buffer even when a page fails to process; previously an exception skipped
# all three close() calls.
try:
    try:
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp):
                if page is not None:
                    interpreter.process_page(page)
    finally:
        device.close()
    # Read the buffer before it is closed; the value is unavailable afterwards.
    out = outfp.getvalue()
finally:
    outfp.close()

In [11]:
# Show the converted text as a list of lines.
out.split('\n')

['ISU World Figure Skating Championships 2015',
 'LADIES SHORT PROGRAM         JUDGES DETAILS PER SKATER',
 'StartingTotal Total TotalTotal',
 'Rank NameNationNumberSegmentElementProgram  ComponentDeductions',
 'ScoreScoreScore (factored)',
 ' 1Elizaveta TUKTAMYSHEVARUS 31 77.62 44.09 33.53 0.00',
 'Executed ',
 '# Base GOE The Judges PanelRef Scores',
 'o',
 'Elements fValue (in random order)of Panel',
 'n',
 'I',
 '13A 8.50 1.57221112221 10.07',
 '23Lz 6.00 1.3022232-1122 7.30',
 '3FSSp3 2.60 0.64121211211 3.24',
 '4LSp4 2.70 0.86212122212 3.56',
 '53T+3T 9.02x 1.00211122113 10.02',
 '6StSq4 3.90 1.50223312123 5.40',
 '7CCoSp3p4 3.50 1.00221222322 4.50',
 ' 36.22 44.09',
 'Program Components Factor',
 'Skating Skills 0.808.258.758.258.508.508.757.758.508.50 8.46',
 'Transition / Linking Footwork 0.807.758.008.008.258.257.257.008.507.75 7.89',
 'Performance / Execution 0.808.759.008.508.509.008.507.258.758.75 8.68',
 'Choreography / Composition 0.808.508.508.508.508.258.507.508.508.50

In [67]:
# Keep the current output lines under a second name
# (presumably for comparison with a later conversion run -- confirm).
prev = out.split('\n')

In [68]:
# Display the saved lines.
prev

['ISU World Figure Skating Championships 2017',
 'MEN SHORT PROGRAM         JUDGES DETAILS PER SKATER',
 'StartingTotal Total TotalTotal',
 'NationNumberSegmentElementProgram  ComponentDeductions',
 'Rank Name',
 'ScoreScoreScore (factored)',
 ' 0.00',
 ' 1Javier FERNANDEZESP 34 109.05 60.79 48.26',
 'Executed Base Scores of ',
 'GOE J1 J2 J3 J4 J5 J7 J8 J9 Ref ',
 '# oJ6 ',
 'Elements Value Panel',
 'f',
 'n',
 'I',
 '14T+3T 14.60 2.86332333332 17.46',
 ' 10.50 2.86333332323',
 '24S 13.36',
 '3FUSp4 2.90 0.362011-10102 3.26',
 '3A 9.35x333333332',
 '4 3.00 12.35',
 '5CCoSp4 3.50 1.00223212222 4.50',
 '6CSSp4 3.00 0.86112122222 3.86',
 '7StSq4 3.90 2.10333333333 6.00',
 ' 47.75 60.79',
 'Program Components Factor ',
 'Skating Skills 1.00 9.25 9.25 9.50 9.50 9.50 9.25 9.50 9.50 9.75 9.43',
 'Transitions 1.00 9.25 9.25 9.50 9.25 9.25 9.25 9.50 9.75 9.50 9.36',
 'Performance 1.00 9.75 9.50 9.75 10.00 9.75 9.50 10.00 10.00 9.75 9.79',
 'Composition 1.00 9.75 9.75 9.75 10.00 9.50 9.25 9.75 