In [36]:
from collections import namedtuple
from enum import Enum
from lxml import html, etree
import os
import re
import requests
import subprocess

In [3]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage, PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams, LTText
from pdfminer.converter import PDFPageAggregator, LTChar, TextConverter
from cStringIO import StringIO

In [55]:
tabula_path = 'tabula-1.0.1-jar-with-dependencies.jar'

In [4]:
class LineConverter(TextConverter):
    def __init__(self, *args, **kwargs):
        TextConverter.__init__(self, *args, **kwargs)

    def end_page(self, i):
        from collections import defaultdict
        lines = defaultdict(lambda: {})
        for child in self.cur_item._objs:  # <-- changed
            if isinstance(child, LTChar):
                (_, _, x, y) = child.bbox
                line = lines[int(-y)]
                line[x] = child._text.encode(self.codec)  # <-- changed
        for y in sorted(lines.keys()):
            line = lines[y]
            self.line_creator(line)
            self.outfp.write(self.line_creator(line))
            self.outfp.write("\n")

    def line_creator(self, line):
        keys = sorted(line.keys())
        # calculate the average distange between each character on this row
        average_distance = sum([keys[i] - keys[i - 1] for i in range(1, len(keys))]) / len(keys)
        # append the first character to the result
        result = [line[keys[0]]]
        for i in range(1, len(keys)):
            result.append(line[keys[i]])
        printable_line = ''.join(result)
        return printable_line

In [5]:
class DisciplineType(Enum):
    men = 0
    ladies = 1
    pairs = 2
    ice_dance = 3

In [6]:
class SegmentType(Enum):
    short = 0
    free = 1
    original_dance = 2

In [7]:
Judge = namedtuple('Judge', ['name', 'country'])

In [57]:
def get_fpath(season, event, fname, ftype = 'pdf'):
    return season.year + '/' + event.name + '/' + ftype + '/' + fname

In [58]:
def get_page(url, season, event, subdir, fname):
    target_dir = season.year + '/' + event.name + '/' + subdir + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'w+') as f:
        f.write(page.content)

In [10]:
class JudgePanel:
    def __init__(self, url, season, event, discipline, segment):
        self.url = url
        self.season = season
        self.event = event
        self.discipline = discipline
        self.segment = segment
        self.fname = segment.name + '_panel.html'
        
    def get_page(self):
        get_page(self.url, self.season, self.event, 'html', self.fname)
        
# ['referee', 'technical_controller', 'technical_specialist', 'asst_technical_specialist',
#                          'judges', 'num_judges', 'data_operator', 'replay_operator',
#                          'file_name', 'url'])  # where this panel info came from

In [11]:
Skater = namedtuple('Skater', ['name', 'country', 'discipline'])

In [12]:
Element = namedtuple('Element', ['name',  # string, e.g. 3A
                                 'base_value',
                                 'info',  # e.g. UR, edge call, REP
                                 'goes']) # list

In [13]:
ProgramComponent = namedtuple('ProgramComponent', ['name', 'factor', 'scores'])

In [14]:
class Scorecard:
    def __init__(self, url, season, event, discipline, segment, skater):
        self.url = url
        self.season = season
        self.event = event
        self.discipline = discipline
        self.segment = segment
        self.skater = skater
    
#         self.rank
#         self.skater  # type Skater
#         self.start_number  # skate order
#         self.total_score
#         self.tes
#         self.pcs
#         self.deductions # maps deduction text to point value
#         self.elements   # list of Elements
#         self.pcs        # list of Program Components
        
#         self.page_number

In [53]:
class Segment:
    def __init__(self, url, season, event, discipline, segment):
        self.url = url  # url to scores
        self.season = season
        self.event = event
        self.discipline = discipline
        self.type = segment
        self.name = discipline.name + '_' + segment.name
        
        self.fname = discipline.name + '_' + segment.name + '.pdf'
        self.fpath = get_fpath(season, event, self.fname)
#         self.scorecards
        self.panel = None

    def __repr__(self):
        return self.event.name + ' ' + self.name

    def get_page(self):
        get_page(self.url, self.season, self.event, 'pdf', self.fname)
        
    def parse_pdf_tabula(self):
        print 'java -jar ' + tabula_path + ' ' + self.fpath
        subprocess.Popen('java -jar ' + tabula_path + ' ' + self.fpath)

    def parse_pdf_plain(self):
        if not os.path.exists(self.fpath):
            self.get_page()
        
        # LAParams
        line_margin=0.2
        line_overlap = 0.8
        detect_vertical = True
        char_margin = 0.01
        word_margin = 0.05
        laparams = LAParams(word_margin=word_margin)
        
        with open(self.fpath, 'rb') as f:
            parser = PDFParser(f)
            document = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = []
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                pages.append(device.get_result())
        
        layout = pages[0]
        layout_text = []
        for l in list(layout):
            if isinstance(l, LTText):
                layout_text.append(l.get_text())
        print layout_text
        
    def parse_pdf_text(self):
        if not os.path.exists(self.fpath):
            self.get_page()
        rsrc = PDFResourceManager()
        outfp = StringIO()
        device = LineConverter(rsrc, outfp, codec="utf-8", laparams=LAParams(char_margin=10.0,line_overlap=0.01)) # default line_overlap 0.5
        
        with open(self.fpath, 'rb') as f:
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(PDFPage.get_pages(f)):
                if page is not None:
                    interpreter.process_page(page)

        device.close()
        out = outfp.getvalue()
        outfp.close()
        return [scorecard.split('\n') for scorecard in out.split('Rank')]

In [95]:
skater_re = re.compile('\s*(\d+)\s*(\D+ \D+)\s*([A-Z][A-Z][A-Z])\s+(\d+)\s+' +
                       '(\d\d?\d?.\d\d)\s+(\d\d?\d?.\d\d)\s+(\d\d?\d?.\d\d)\s*(-?\d.\d\d)')
elt_re = re.compile('(\d)(\S+)\s+(\d\d?.\d\d)\s+(-?\d.\d\d)\s*((-?\d)+)\s+(\d?\d.\d\d)')

In [35]:
import csv
sa14 = seas1415.events[0]
mens_short = sa14.disciplines[0].segments[0]
with open('1415/gpusa2014/pdf/men_short.csv', 'rb') as f:
    reader = csv.reader(f)
    for row in reader:
        print ' '.join(row)

ISU GP 2014 Hilton HHonors Skate America     
MEN SHORT PROGRAMJUDGES DETAILS PER SKATER     
Starting Total Total  Total Total 
Rank Name Nation Number Segment Element  Program  Component Deductions 
 Score Score  Score (factored)  
1 Tatsuki MACHIDA JPN  12 93.39  49.65  43.74 0.00 
# Executed Base GOE The Judges Panel   Ref Scores 
Elements Value (in random order)   of Panel 
1 4T+3T  14.40  1.29 1 1 2 1 2 1 0  1 2 15.69 
2 3A  8.50  1.86 2 1 1 2 2 2 2  3 2 10.36 
3 CCSp3  2.80  0.57 1 2 1 1 1 2 1  0 1 3.37 
4 FSSp4  3.00  0.79 2 2 1 2 2 1 2  1 1 3.79 
Info   Info  Info
5 StSq3  3.30  1.00 2 2 2 2 2 3 2  2 1 4.30 
6 3Lz  6.60 x  1.40 2 1 2 2 2 2 2  2 2 8.00 
7 CCoSp3p4  3.50  0.64 1 2 1 1 2 1 1  1 2 4.14 
42.10    49.65 
Program Components Factor     
Skating Skills  1.00 8.75 9.00 8.75 8.25 8.75 9.00 8.75  8.25 8.75 8.71 
Transition / Linking Footwork  1.00 8.50 9.00 8.75 8.00 8.75 8.75 8.50  8.25 9.00 8.64 
Performance / Execution  1.00 9.00 8.75 9.00 8.75 9.00 9.00 9.00  8.50 8.7

In [97]:
sa14 = seas1415.events[0]
sa14.get_event_info()
mens_short = sa14.disciplines[0].segments[0]
out = mens_short.parse_pdf_text()
matches = []

entry = out[1]
# if 'Name' not in entry[0]:
#     pass
for line in entry:
    skater_match = skater_re.match(line)
    if skater_match:
        skater_info = {}
        for i, info in enumerate(('rank', 'name', 'country', 'starting_number', 'total_score', 'tes', 'pcs', 'deductions')):
            skater_info[info] = skater_match.group(i + 1)
        skater = Skater(skater_info['name'], skater_info['country'], 'men')
        scorecard = Scorecard('sth', seas1415, sa14, sa14.disciplines[0], mens_short, skater)
    
    elt_match = elt_re.match(line)
    if elt_match:
        elt_info = {}
        for i, info in enumerate(('elt_number', 'name', 'base_value', 'goe', 'goes', 'elt_score')):
            elt_info[info] = elt_match.group(i+1)
        elt = Element(elt_info['name'], elt_info['base_value'], )
        
        
        Element = namedtuple('Element', ['name',  # string, e.g. 3A
                                 'base_value',
                                 'info',  # e.g. UR, edge call, REP
                                 'goes']) # list

ISU GP 2014 Hilton HHonors Skate America
{'name': '4T+3T', 'goe': '1.29', 'goes': '112121012', 'elt_score': '2', 'elt_number': '1', 'base_value': '14.40'}
{'name': '3A', 'goe': '1.86', 'goes': '211222232', 'elt_score': '2', 'elt_number': '2', 'base_value': '8.50'}
{'name': 'CCSp3', 'goe': '0.57', 'goes': '121112101', 'elt_score': '1', 'elt_number': '3', 'base_value': '2.80'}
{'name': 'FSSp4', 'goe': '0.79', 'goes': '221221211', 'elt_score': '1', 'elt_number': '4', 'base_value': '3.00'}
{'name': 'StSq3', 'goe': '1.00', 'goes': '222223221', 'elt_score': '1', 'elt_number': '5', 'base_value': '3.30'}
{'name': 'CCoSp3p4', 'goe': '0.64', 'goes': '121121112', 'elt_score': '2', 'elt_number': '7', 'base_value': '3.50'}


In [1]:
class Discipline:
    def __init__(self, season, event, discipline):
        self.discipline = discipline  # of type DisciplineType
        self.season = season
        self.event = event
        self.segments = []  # list of Segments
        self.entries = []
        self.entries_url = None
        self.entries_fname = self.discipline.name + '_entries.html'
        self.results = []
        self.results_url = None
        self.results_fname = self.discipline.name + '_results.html'
    
    def __repr__(self):
        return self.event.name + ' ' + self.discipline.name
    
    def get_entries_results_pages(self):
        assert self.entries_url and self.results_url
        get_page(self.entries_url, self.season, self.event, 'html', self.entries_fname)
        get_page(self.results_url, self.season, self.event, 'html', self.results_fname)

In [48]:
class Event:
    def __init__(self, season, name):
        self.name = name      # the abbreviation + year
        self.season = season  # season object
        self.url = season.url + name + '/'
    
    def __repr__(self):
        return self.name

    def pdfs_to_csvs(self):
        for discipline in self.disciplines:
            for segment in discipline.segments:
                segment.parse_pdf_tabula()
    
    def get_event_info(self, fetch_files=False):
        page = requests.get(self.url)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print 'Event not found: % s' % event
            return

        if not list(tree.iter('title')):
            print 'Event not found: % s' % event
            return
        else:
            print list(tree.iter('title'))[0].text
        
        # Drill down to the table part.
        table = tree.getchildren()[1].getchildren()[2].getchildren()
        elts = self._flatten_html_children(table)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        
        # 12 rows of actual table: cut out header and juniors
        rows = elts[2].getchildren()[0].getchildren()[1:13]

        self.disciplines = []
        for i, discipline_type in enumerate(list(DisciplineType)):
            
            discipline = Discipline(self.season, self, discipline_type)
            
            # Get entries and results pages.
            entries, results = self._flatten_html_children(rows[i * 3])
            discipline.entries_url = self.url + entries.attrib['href']
            discipline.results_url = self.url + results.attrib['href']
            if fetch_files:
                discipline.get_entries_results_pages()
            
            # Get segment files.
            for prog in (SegmentType.short, SegmentType.free):
                j = prog.value
                panel, detailed, scorecards = self._flatten_html_children(rows[i * 3 + j + 1])
                segment = Segment(self.url + scorecards.attrib['href'], self.season, self, discipline_type, prog)
                
                if fetch_files:
                    segment.get_page()
                discipline.segments.append(segment)

                panel = JudgePanel(self.url + panel.attrib['href'], self.season, self, discipline_type, segment)                
                if fetch_files:
                    panel.get_page()
            self.disciplines.append(discipline)
                
                
    def _flatten_html_children(self, elts):
        children = []
        for elt in elts:
            for child in elt.getchildren():
                children.append(child)
        return children

In [60]:
nhk15 = seas1516.events[5]
nhk15.pdfs_to_csvs()

java -jar tabula-1.0.1-jar-with-dependencies.jar 1516/gpjpn2015/pdf/men_short.pdf


OSError: [Errno 2] No such file or directory

In [59]:
seas1415 = Season ('2015')
seas1516 = Season('2016')
for seas in (seas1415, seas1516):
    for event in seas.events:
        if event.name == 'gpfra2015':
            print 'GP France 2015 was cancelled partway through.'
            continue
        event.get_event_info()

ISU GP 2014 Hilton HHonors Skate America
ISU GP Skate Canada International 2014
ISU GP Lexus Cup of China 2014
ISU GP Rostelecom Cup 2014
ISU GP Trophee Eric Bompard 2014
ISU GP NHK Trophy 2014
ISU Grand Prix of Figure Skating Final 2014
ISU European Figure Skating Championships 2015
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2015
ISU GP 2015 Progressive Skate America
ISU GP Skate Canada International 2015
ISU GP Audi Cup of China 2015
ISU GP Rostelecom Cup 2015
GP France 2015 was cancelled partway through.
ISU GP NHK Trophy 2015
ISU Grand Prix of Figure Skating Final
ISU European Figure Skating Championships 2016
ISU Four Continents Figure Skating Championships
Event not found: owg2016
ISU World Figure Skating Championships 2016


In [18]:
class Season:
    def __init__(self, spring_str_rep):
        champ_year = spring_str_rep
        gp_year = str(int(spring_str_rep) - 1)
        two_digit_season = int(spring_str_rep[2:])
        if two_digit_season <= 9:
            twotwo_year = '0' + str(two_digit_season - 1) + '0' + str(two_digit_season)
        elif two_digit_season <= 10:
            twotwo_year = '0' + str(two_digit_season - 1) + str(two_digit_season)
        else:
            twotwo_year = str(two_digit_season - 1) + str(two_digit_season)
        gpf_year = twotwo_year
        
        if int(spring_str_rep) % 4 == 0:
            champs = ['ec', 'fc', 'owg', 'wc']
        else:
            champs = ['ec', 'fc', 'wc']
        gps = ['usa', 'can', 'chn', 'rus', 'fra', 'jpn']
        gps = ['gp' + gp for gp in gps]
        
        self.year = twotwo_year
        
        event_names = [gp + gp_year for gp in gps]
        event_names.append('gpf' + twotwo_year)
        event_names += [champ + champ_year for champ in champs]
        
        self.url = 'http://www.isuresults.com/results/'
        if int(spring_str_rep) >= 2016:
            self.url += 'season'+ twotwo_year + '/'

        self.events = [Event(self, event_name) for event_name in event_names]

    def __repr__(self):
        return 'Season: \'' + self.year[:2] + '-\'' + self.year[2:]

In [165]:
def list_children(elts):
    children = []
    for elt in elts:
        for child in elt.getchildren():
            children.append(child)
    return children