In [76]:
from collections import namedtuple
import csv
from enum import Enum
from lxml import html, etree
import os
import re
import requests
import subprocess

In [55]:
# Path of the Tabula jar that Event.pdfs_to_csvs passes to `java -jar`.
# It is a bare file name, so it is resolved against the current working directory.
tabula_path = 'tabula-1.0.1-jar-with-dependencies.jar'

In [5]:
class DisciplineType(Enum):
    """The four disciplines handled for each event.

    Event.get_event_info enumerates the members in definition order and uses
    each member's position to select that discipline's rows of the results
    table, so the order of the members is significant.
    """
    men = 0
    ladies = 1
    pairs = 2
    ice_dance = 3

In [6]:
class SegmentType(Enum):
    """The kinds of competition segment.

    Event.get_event_info uses the `value` of `short` and `free` as a row
    offset inside a discipline's group of table rows, so those two values
    must stay 0 and 1. `original_dance` is not used by that method.
    """
    short = 0
    free = 1
    original_dance = 2

In [7]:
# Immutable record for one judge: `name` and `country`.
Judge = namedtuple('Judge', 'name country')

In [97]:
def get_fpath(season, event, fname):
    """Return the relative path '<season.year>/<event.name>/<fname>'.

    The parts are always joined with forward slashes.
    """
    return '/'.join((season.year, event.name, fname))

In [98]:
def get_page(url, season, event, fname):
    """Download `url` and save the response body as '<season.year>/<event.name>/<fname>'.

    The target directory is created when it does not exist yet. Nothing is
    returned; the HTTP status of the response is not checked.

    The file is opened in binary mode because `page.content` is the raw
    response body and callers use this for PDF score sheets as well as HTML
    pages; text mode can corrupt binary data on some platforms.
    """
    target_dir = season.year + '/' + event.name + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'wb') as f:
        f.write(page.content)

In [99]:
class JudgePanel:
    """Pointer to the judges-panel page of one segment of an event."""

    def __init__(self, url, season, event, discipline, segment):
        # The saved file is named after the segment, so `segment` must expose `name`.
        self.fname = ''.join((segment.name, '_panel.html'))
        self.segment = segment
        self.discipline = discipline
        self.event = event
        self.season = season
        self.url = url

    def get_page(self):
        """Download the panel page into the event's directory."""
        get_page(url=self.url, season=self.season, event=self.event, fname=self.fname)
        
# ['referee', 'technical_controller', 'technical_specialist', 'asst_technical_specialist',
#                          'judges', 'num_judges', 'data_operator', 'replay_operator',
#                          'file_name', 'url'])  # where this panel info came from

In [100]:
# Immutable record for one competitor: `name`, `country` and `discipline`.
Skater = namedtuple('Skater', 'name country discipline')

In [165]:
# One executed element on a scorecard. Fields, in order:
#   number      order of the element in the program
#   name        element code, e.g. 3A
#   info        info marker, e.g. UR
#   base_value  base value (meant to be a float)
#   bonus       bonus marker
#   goe         grade of execution aggregated over the judges
#   goes        the individual judges' GOEs (meant to be a list, each -3 to 3)
#   points      total points for the element
Element = namedtuple('Element', 'number name info base_value bonus goe goes points')

In [102]:
# Immutable record for one program component: `name`, `factor` and `scores`.
ProgramComponent = namedtuple('ProgramComponent', 'name factor scores')

In [119]:
class Segment:
    """One segment of one discipline at an event, plus the files holding its scores.

    `name`/`fname` are '<discipline.name>_<segment.name>'; the PDF and CSV
    file names and paths are derived from that base.
    """

    def __init__(self, url, season, event, discipline, segment):
        base = '_'.join((discipline.name, segment.name))

        self.url = url  # url to scores
        self.season = season
        self.event = event
        self.discipline = discipline
        self.type = segment
        self.name = base

        self.fname = base
        self.pdf_fname = base + '.pdf'
        self.csv_fname = base + '.csv'
        self.fpath = get_fpath(season, event, self.pdf_fname)
        self.csv_path = get_fpath(season, event, self.csv_fname)
        # Scorecards are not stored on the segment yet; the panel starts unset.
        self.panel = None

    def __repr__(self):
        return ' '.join((self.event.name, self.name))

    def get_page(self):
        """Download the score-sheet PDF into the event's directory."""
        get_page(self.url, self.season, self.event, self.pdf_fname)

    def print_csv(self):
        """Print every row of the segment's CSV file, fields separated by spaces."""
        with open(self.csv_path, 'rb') as csv_file:
            for fields in csv.reader(csv_file):
                print(' '.join(fields))

In [188]:
# Patterns for the text lines of a "judges details per skater" score sheet.
# Raw strings keep the regex escapes literal, and every decimal point is
# escaped so it matches only a real '.' rather than any character.
points = r'(\d\d?\d?\.\d\d)'  # a score with two decimals, e.g. 9.35 or 106.33

# Header line of a skater: rank, name, country, start number and the totals.
skater_re = re.compile(r'(\d+)\s*' +        # rank
                       r'(\D+ \D+)\s*' +    # skater name
                       r'([A-Z][A-Z][A-Z])\s*' +  # country
                       r'(\d+)\s*' +        # starting number
                       points + r'\s*' +    # total score
                       points + r'\s*' +    # tes
                       points + r'\s*' +    # pcs
                       r'(-?\d\.\d\d)')     # deductions

# One executed element. The element order accepts several digits so that
# programs with ten or more elements are matched as well.
elt_re = re.compile(r'(\d+)\s*' +        # element order
                    r'(\S+)\s*' +        # element name
                    r'(\D*)\s*' +        # info (i.e. UR)
                    points + r'\s*' +    # base value
                    r'(x?)\s*' +         # bonus marker
                    r'(-?\d\.\d\d)\s*' + # goe
                    r'((?:-?\d\s*)+|-)\s+' +  # goes
                    r'(\d?\d\.\d\d)')    # element score

# Totals line under the elements: exactly two scores and nothing else.
tes_re = re.compile(r'^' + points + r'\s*' +    # total base value
                    points + r'\s*$')           # total tes

In [169]:
# Shortcuts for the sixth event of the 2015-16 season, its first discipline
# and that discipline's first segment.
nhk15 = seas1516.events[5]
nhk15_men = nhk15.disciplines[0]
nhk15_men_short = nhk15_men.segments[0]

In [177]:
# NOTE(review): this self-assignment is a no-op and only succeeds when `rows`
# already exists in the kernel; no visible cell defines it. Presumably it
# holds the text lines of a converted score sheet, since the parsing loop
# iterates over it and strips each entry - TODO: load it explicitly here.
rows = rows

In [189]:
# Parse the score-sheet lines in `rows` into a Scorecard for the first skater
# of the NHK 2015 men's short program. Every non-header line is echoed, and
# each parsed element is echoed as a list, to help debug the patterns.
skater = None
scorecard = None
for line in rows:
    line = line.strip()

    skater_match = skater_re.match(line)
    if skater_match:
        if skater:
            break  # just look at the first skater for now
        # The eight groups of skater_re, in pattern order.
        skater_info = dict(zip(('rank', 'name', 'country', 'starting_number',
                                'total_score', 'tes', 'pcs', 'deductions'),
                               skater_match.groups()))
        skater = Skater(skater_info['name'], skater_info['country'], 'men')
        scorecard = Scorecard(nhk15_men_short.url, seas1516, nhk15, nhk15_men, nhk15_men_short, skater,
                              skater_info['rank'], skater_info['starting_number'],
                              skater_info['total_score'], skater_info['tes'],
                              skater_info['pcs'], skater_info['deductions'])
        continue

    print(line)
    if scorecard is None:
        # Lines before the first skater header have no scorecard to attach to.
        continue

    elt_match = elt_re.match(line)
    if elt_match:
        # Groups of elt_re line up one-to-one with the fields of Element.
        elt_info = list(elt_match.groups())
        print(elt_info)
        elt = Element._make(elt_info)
        scorecard.elements.append(elt)

    tes_match = tes_re.match(line)
    if tes_match and scorecard.elements:
        # NOTE(review): no `aggregate_elements` method is visible on Scorecard
        # in this notebook; confirm it exists before relying on this call.
        scorecard.aggregate_elements()
        # TODO: check the aggregated totals against the totals line:
        #       float(tes_match.group(1)) is the total base value and
        #       float(tes_match.group(2)) is the total element score.

    # TODO: collect the program component scores (the lines after one that
    #       contains both 'Program' and 'Components').

ISU GP NHK Trophy 2015
MEN SHORT PROGRAMJUDGES DETAILS PER SKATER
Starting Total Total  Total Total
Rank Name Nation Number Segment Element  Program  Component Deductions
Score Score  Score (factored)
# Executed Base GOE The Judges Panel   Ref Scores
Elements Value (in random order)   of Panel
1 4S  10.50  1.00 1 2 1 1 1 2 0  0 1 11.50
['1', '4S', '', '10.50', '', '1.00', '1 2 1 1 1 2 0  0 1', '11.50']
2 4T+3T  14.60  2.57 2 3 2 3 3 3 2  2 3 17.17
['2', '4T+3T', '', '14.60', '', '2.57', '2 3 2 3 3 3 2  2 3', '17.17']
3 FCSp4  3.20  1.07 2 3 2 2 3 2 2  2 2 4.27
['3', 'FCSp4', '', '3.20', '', '1.07', '2 3 2 2 3 2 2  2 2', '4.27']
4 3A  9.35 x  2.43 2 3 2 2 3 3 3  2 2 11.78
['4', '3A', '', '9.35', 'x', '2.43', '2 3 2 2 3 3 3  2 2', '11.78']
Info   Info  Info
5 CSSp4  3.00  1.21 3 2 3 2 3 2 3  2 2 4.21
['5', 'CSSp4', '', '3.00', '', '1.21', '3 2 3 2 3 2 3  2 2', '4.21']
6 StSq4  3.90  1.90 3 3 2 3 3 3 2  3 2 5.80
['6', 'StSq4', '', '3.90', '', '1.90', '3 3 2 3 3 3 2  3 2', '5.80']
7 CCoSp3

In [185]:
# Inspect the elements that the parsing loop appended to the scorecard.
scorecard.elements

59.44


In [140]:
class Scorecard:
    """One skater's result sheet for one segment.

    The numeric header values may be passed as strings (as captured by a
    regex); they are converted here. `rank` and `starting_number` become
    ints; `total_score`, `tes`, `pcs` and `deductions` become floats.

    Raises ValueError if a numeric argument cannot be converted.
    """

    def __init__(self, url, season, event, discipline, segment, skater,
                 rank, starting_number, total_score, tes, pcs, deductions):
        self.url = url
        self.season = season
        self.event = event
        self.discipline = discipline
        self.segment = segment
        self.skater = skater
        self.rank = int(rank)
        self.starting_number = int(starting_number)
        # The segment total was previously accepted but never stored.
        self.total_score = float(total_score)
        self.tes = float(tes)
        self.pcs = float(pcs)
        self.total_deductions = float(deductions)

        self.elements = []    # list of Elements
        self.components = []  # list of Program Components
        self.deductions = {}  # maps deduction text to point value

        # TODO: record the page number of the score sheet this card came from.

In [108]:
class Discipline:
    """One discipline at an event: its segments and its entries/results pages."""

    def __init__(self, season, event, discipline):
        self.season = season
        self.event = event
        self.discipline = discipline  # of type DisciplineType

        self.segments = []  # list of Segments
        self.entries, self.results = [], []
        # The page urls are filled in later by whoever scrapes the event.
        self.entries_url = self.results_url = None

        prefix = self.discipline.name
        self.entries_fname = prefix + '_entries.html'
        self.results_fname = prefix + '_results.html'

    def __repr__(self):
        return ' '.join((self.event.name, self.discipline.name))

    def get_entries_results_pages(self):
        """Download the entries page, then the results page; both urls must be set."""
        assert self.entries_url and self.results_url
        downloads = ((self.entries_url, self.entries_fname),
                     (self.results_url, self.results_fname))
        for page_url, page_fname in downloads:
            get_page(page_url, self.season, self.event, page_fname)

In [115]:
class Event:
    """One competition of a season, identified by its abbreviation plus year.

    `url` is the event's page under the season's base url and `dirpath` is the
    local directory '<season.year>/<name>/' where its files are kept.
    `disciplines` is empty until get_event_info() succeeds.
    """

    def __init__(self, season, name):
        self.name = name      # the abbreviation + year
        self.season = season  # season object
        self.url = season.url + name + '/'
        self.dirpath = season.year + '/' + self.name + '/'
        # Always present, so events that were skipped or not found can still
        # be inspected without an AttributeError.
        self.disciplines = []

    def __repr__(self):
        return self.name

    def pdfs_to_csvs(self):
        """Start Tabula on the event's directory and return the process handle.

        Runs `java -jar <tabula_path> -p all -b <dirpath>` without waiting for
        it to finish. The arguments are passed as a list, so no shell is
        involved and the paths are not subject to shell interpretation.
        """
        return subprocess.Popen(['java', '-jar', tabula_path,
                                 '-p', 'all', '-b', self.dirpath])

    def get_event_info(self, fetch_files=False):
        """Scrape the event page and fill `self.disciplines`.

        For each DisciplineType a Discipline is built with its entries and
        results urls and a short and a free Segment; each segment gets its
        JudgePanel attached as `segment.panel`. With `fetch_files` true, the
        entries, results, score-sheet and panel pages are downloaded too.

        Prints the page title on success. If the page cannot be parsed or has
        no title, prints 'Event not found' and returns without changes.
        """
        page = requests.get(self.url)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print('Event not found: %s' % self)
            return

        titles = list(tree.iter('title'))
        if not titles:
            print('Event not found: %s' % self)
            return
        print(titles[0].text)

        # Drill down to the table part.
        table = tree.getchildren()[1].getchildren()[2].getchildren()
        elts = self._flatten_html_children(table)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)

        # 12 rows of actual table: cut out header and juniors
        rows = elts[2].getchildren()[0].getchildren()[1:13]

        # Each discipline owns three consecutive rows: entries/results links,
        # then the short segment, then the free segment.
        disciplines = []
        for i, discipline_type in enumerate(list(DisciplineType)):
            discipline = Discipline(self.season, self, discipline_type)

            # Get entries and results pages.
            entries, results = self._flatten_html_children(rows[i * 3])
            discipline.entries_url = self.url + entries.attrib['href']
            discipline.results_url = self.url + results.attrib['href']
            if fetch_files:
                discipline.get_entries_results_pages()

            # Get segment files.
            for prog in (SegmentType.short, SegmentType.free):
                row = rows[i * 3 + prog.value + 1]
                panel_link, _detailed, scorecards_link = self._flatten_html_children(row)

                segment = Segment(self.url + scorecards_link.attrib['href'],
                                  self.season, self, discipline_type, prog)
                if fetch_files:
                    segment.get_page()
                discipline.segments.append(segment)

                # Keep the panel on its segment instead of discarding it.
                segment.panel = JudgePanel(self.url + panel_link.attrib['href'],
                                           self.season, self, discipline_type, segment)
                if fetch_files:
                    segment.panel.get_page()
            disciplines.append(discipline)

        self.disciplines = disciplines

    def _flatten_html_children(self, elts):
        """Return the children of every element in `elts` as one flat list."""
        return [child for elt in elts for child in elt.getchildren()]

In [120]:
# Build the 2014-15 and 2015-16 seasons and scrape every event page, except
# the one event that is known to be incomplete.
seas1415 = Season('2015')
seas1516 = Season('2016')
for seas in (seas1415, seas1516):
    for event in seas.events:
        if event.name != 'gpfra2015':
            event.get_event_info()
        else:
            print('GP France 2015 was cancelled partway through.')

ISU GP 2014 Hilton HHonors Skate America
ISU GP Skate Canada International 2014
ISU GP Lexus Cup of China 2014
ISU GP Rostelecom Cup 2014
ISU GP Trophee Eric Bompard 2014
ISU GP NHK Trophy 2014
ISU Grand Prix of Figure Skating Final 2014
ISU European Figure Skating Championships 2015
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2015
ISU GP 2015 Progressive Skate America
ISU GP Skate Canada International 2015
ISU GP Audi Cup of China 2015
ISU GP Rostelecom Cup 2015
GP France 2015 was cancelled partway through.
ISU GP NHK Trophy 2015
ISU Grand Prix of Figure Skating Final
ISU European Figure Skating Championships 2016
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2016


In [73]:
class Season:
    """A season, identified by the calendar year in which it ends.

    `spring_str_rep` is that year as a string, e.g. '2016' for 2015-16.
    `year` is the four-digit season code ('1516'), `url` the base url of the
    season's results and `events` the list of Event objects: six Grand Prix
    events named with the earlier year, the final named with the season code,
    and the championships named with the later year ('owg' is added when the
    year modulo 4 is 2).
    """

    @staticmethod
    def _season_code(spring_year):
        """Return the zero-padded two-digit start and end years, e.g. '1516'.

        The start year wraps around at a century boundary, so 2000 gives '9900'.
        """
        end = int(spring_year) % 100
        return '%02d%02d' % ((end - 1) % 100, end)

    def __init__(self, spring_str_rep):
        champ_year = spring_str_rep
        gp_year = str(int(spring_str_rep) - 1)
        twotwo_year = self._season_code(spring_str_rep)

        if int(spring_str_rep) % 4 == 2:
            champs = ['ec', 'fc', 'owg', 'wc']
        else:
            champs = ['ec', 'fc', 'wc']
        gps = ['usa', 'can', 'chn', 'rus', 'fra', 'jpn']
        gps = ['gp' + gp for gp in gps]

        self.year = twotwo_year

        event_names = [gp + gp_year for gp in gps]
        event_names.append('gpf' + twotwo_year)
        event_names += [champ + champ_year for champ in champs]

        # From the season ending in 2016 on, results sit in a per-season folder.
        self.url = 'http://www.isuresults.com/results/'
        if int(spring_str_rep) >= 2016:
            self.url += 'season' + twotwo_year + '/'

        # Event reads self.url and self.year, so both are set before this line.
        self.events = [Event(self, event_name) for event_name in event_names]

    def __repr__(self):
        return 'Season: \'' + self.year[:2] + '-\'' + self.year[2:]

In [165]:
def list_children(elts):
    """Return the children of every element in `elts` as one flat list, in order."""
    return [child for elt in elts for child in elt.getchildren()]