In [1]:
from collections import namedtuple
import csv
from enum import Enum
from lxml import html, etree
import os
import re
import requests
import subprocess

In [2]:
tabula_path = 'tabula-1.0.1-jar-with-dependencies.jar'

In [3]:
class DisciplineType(Enum):
    men = 0
    ladies = 1
    pairs = 2
    ice_dance = 3

In [4]:
class SegmentType(Enum):
    short = 0
    free = 1
    original_dance = 2

In [5]:
Judge = namedtuple('Judge', ['name', 'country'])

In [6]:
def get_fpath(season, event, fname):
    return season.year + '/' + event.name + '/' + fname

In [7]:
def get_page(url, season, event, fname):
    target_dir = season.year + '/' + event.name + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'w+') as f:
        f.write(page.content)

In [8]:
class JudgePanel:
    def __init__(self, url, season, event, discipline, segment):
        self.url = url
        self.season = season
        self.event = event
        self.discipline = discipline
        self.segment = segment
        self.fname = segment.name + '_panel.html'
        
    def get_page(self):
        get_page(self.url, self.season, self.event, self.fname)
        
# ['referee', 'technical_controller', 'technical_specialist', 'asst_technical_specialist',
#                          'judges', 'num_judges', 'data_operator', 'replay_operator',
#                          'file_name', 'url'])  # where this panel info came from

In [9]:
Skater = namedtuple('Skater', ['name', 'country', 'discipline'])

In [10]:
Element = namedtuple('Element', ['number',       # order in program
                                 'name',         # e.g. 3A
                                 'info',         # e.g. UR
                                 'base_value',   # float
                                 'bonus',
                                 'goe',          # aggregated over judges
                                 'goes',         # list of individual judge GOEs (-3 to 3)
                                 'points'])      # total points for element

In [11]:
ProgramComponent = namedtuple('ProgramComponent', ['name',    # e.g. 'Skating Skills'
                                                   'factor',  # e.g. 1.0 in men's short, 2.0 in men's free
                                                   'scores',  # list of individual judge's scores
                                                   'points']) # aggregated judge's scores multiplied by factor

In [41]:
class Segment:
    def __init__(self, url, season, event, discipline, segment):
        self.url = url  # url to scores
        self.season = season
        self.event = event
        self.discipline = discipline
        self.type = segment
        self.name = discipline.name + '_' + segment.name
        
        self.fname = discipline.name + '_' + segment.name
        self.pdf_fname = self.fname + '.pdf'
        self.csv_fname = self.fname + '.csv'
        self.fpath = get_fpath(season, event, self.pdf_fname)
        self.csv_path = get_fpath(season, event, self.csv_fname)
#         self.scorecards
        self.panel = None
    
        self.scorecards = []

    def __repr__(self):
        return self.event.name + ' ' + self.name

    def get_page(self):
        get_page(self.url, self.season, self.event, self.pdf_fname)
    
    def print_csv(self):
        with open(self.csv_path, 'rb') as f:
            reader = csv.reader(f)
            for row in reader:
                print ' '.join(row)
    
    def get_csv_rows(self):
        rows = []
        with open(self.csv_path, 'rb') as f:
            reader = csv.reader(f)
            for row in reader:
                rows.append(''.join(row))
        return row

In [37]:
print ('(\d)\s*' +         # element order
                    '(\S+)\s*' +        # element name
                    '(\D*)\s*' +        # info (i.e. UR)
                    points + '\s*' +    # base value
                    '(x?)\s*' +         # bonus marker
                    '(-?\d.\d\d)\s*' +  # goe
                    '((?:-?\d\s*|-){' + num_judges + '})\s+' +  # goes
                    '(\d?\d.\d\d)')

(\d)\s*(\S+)\s*(\D*)\s*(\d\d?\d?.\d\d)\s*(x?)\s*(-?\d.\d\d)\s*((?:-?\d\s*|-){9})\s+(\d?\d.\d\d)


In [98]:
num_judges = '9'
points = '(\d\d?\d?.\d\d)'
skater_re = re.compile('(\d+)\s*' +        # rank
                       '(\D+ \D+?)\s*' +    # skater name
                       '([A-Z][A-Z][A-Z])\s*' +  # country
                       '(\d\d?)\s*' +        # starting number
                       '(\d\d\d?.\d\d)\s*' +    # total score
                       points + '\s*' +    # tes
                       points + '\s*' +    # pcs
                       '(-?\d.\d\d)')      # deductions
elt_re = re.compile('(\d)\s*' +         # element order
                    '(\S+)\s*' +        # element name
                    '(\D*?)\s*' +        # info (i.e. UR)
                    points + '\s*' +    # base value
                    '(x?)\s*' +         # bonus marker
                    '(-?\d.\d\d)\s*' +  # goe
                    '((?:-?\d\s*|-){' + num_judges + '})\s*' +  # goes
                    '(\d?\d.\d\d)')     # element score
tes_re = re.compile('^' + points + '\s*' +    # total base value
                    points + '\s*$')     # total tes
component_re = re.compile('(\D+?)\s*' +   # component name
                          '(\d.\d\d)\s*' + # factor
                          '((?:\d?\d.\d\d\s*){' + num_judges + '})\s*' +  # judges marks
                          '(\d?\d.\d\d)') # aggregated judges marks
pcs_re = re.compile('\D+\s+Program\s+Component\s+\D+\s+' + points)

In [20]:
nhk15 = seas1516.events[5]
nhk15_men = nhk15.disciplines[0]
nhk15_men_short = seas1516.events[5].disciplines[0].segments[0]

In [58]:
rows

['ISU GP NHK Trophy 2015',
 'MEN SHORT PROGRAMJUDGES DETAILS PER SKATER',
 'StartingTotal TotalTotalTotal',
 'Rank Name Nation NumberSegment ElementProgram  ComponentDeductions',
 'Score ScoreScore (factored)',
 '1 Yuzuru HANYU JPN  12106.33  59.4446.890.00',
 '# Executed Base GOEThe Judges PanelRef Scores',
 'Elements Value(in random order)of Panel',
 '1 4S  10.50  1.00 1 2 11 1 2 00 111.50',
 '2 4T+3T  14.60  2.57 2 3 23 3 3 22 317.17',
 '3 FCSp4  3.20  1.07 2 3 22 3 2 22 24.27',
 '4 3A  9.35 x  2.43 2 3 22 3 3 32 211.78',
 'InfoInfoInfo',
 '5 CSSp4  3.00  1.21 3 2 32 3 2 32 24.21',
 '6 StSq4  3.90  1.90 3 3 23 3 3 23 25.80',
 '7 CCoSp3p4  3.50  1.21 3 2 32 3 3 22 24.71',
 '48.0559.44',
 'Program Components Factor',
 'Skating Skills  1.00 9.50 9.50 9.259.25 9.50 9.50 9.508.75 9.259.39',
 'Transition / Linking Footwork  1.00 9.50 9.00 8.759.25 9.50 9.50 9.258.50 9.009.18',
 'Performance / Execution  1.00 9.50 9.75 9.009.50 10.00 9.75 9.508.75 9.509.50',
 'Choreography / Composition  1

In [99]:
skater = None
scorecard = None
nhk15_men_short.scorecards = []
for line in rows:
    line = line.strip()
    
    skater_match = skater_re.match(line)
    if skater_match:
        if scorecard:
            nhk15_men_short.scorecards.append(scorecard)
        skater_info = {}
        
        for i, info in enumerate(('rank', 'name', 'country', 'starting_number', 'total_score', 'tes', 'pcs', 'deductions')):
            skater_info[info] = skater_match.group(i + 1)
        skater = Skater(skater_info['name'], skater_info['country'], 'men')
        scorecard = Scorecard(nhk15_men_short.url, seas1516, nhk15, nhk15_men, nhk15_men_short, skater,
                              skater_info['rank'], skater_info['starting_number'],
                              skater_info['total_score'], skater_info['tes'], skater_info['pcs'], skater_info['deductions'])
        continue
        
    elt_match = elt_re.match(line)
    component_match = component_re.match(line)
    if elt_match:
        elt_info = elt_match.groups()
        scorecard.add_element(elt_info)
    
    elif scorecard and scorecard.elements and tes_re.match(line):
#         scorecard.aggregate_elements()
        match = tes_re.match(line)
        print match.groups()
#         assert float(match.group(1)) == 
#         assert float(match.group(2))
    
    elif component_match:
        comp_info = component_match.groups()
        scorecard.add_component(comp_info)
    
    elif scorecard and len(scorecard.components) == 5:
        if pcs_re.match(line):
            match = pcs_re.match(line)
            print match.groups()
        elif 'Deductions' in line:
            # do something
            print line
        

('1', '4S', '', '10.50', '', '1.00', '1 2 11 1 2 00 1', '11.50')
('2', '4T+3T', '', '14.60', '', '2.57', '2 3 23 3 3 22 3', '17.17')
('3', 'FCSp4', '', '3.20', '', '1.07', '2 3 22 3 2 22 2', '4.27')
('4', '3A', '', '9.35', 'x', '2.43', '2 3 22 3 3 32 2', '11.78')
('5', 'CSSp4', '', '3.00', '', '1.21', '3 2 32 3 2 32 2', '4.21')
('6', 'StSq4', '', '3.90', '', '1.90', '3 3 23 3 3 23 2', '5.80')
('7', 'CCoSp3p4', '', '3.50', '', '1.21', '3 2 32 3 3 22 2', '4.71')
('48.05', '59.44')
Deductions:0.00
Rank Name Nation NumberSegment ElementProgram  ComponentDeductions
('1', '4Lz+3T', '', '17.90', '', '1.43', '1 1 22 2 1 12 1', '19.33')
('2', '3A', '', '8.50', '', '1.57', '1 2 22 2 1 21 1', '10.07')
('3', 'CCoSp3p4', '', '3.50', '', '0.21', '1 1 10 0 0 00 1', '3.71')
('4', 'StSq3', '', '3.30', '', '0.71', '2 1 22 1 1 11 2', '4.01')
('5', '4T', '', '11.33', 'x', '0.71', '1 1 2-1 1 0 01 1', '12.04')
('6', 'FCSp4', '', '3.20', '', '0.64', '1 1 22 1 1 21 1', '3.84')
('7', 'CSSp4', '', '3.00', '', '

In [101]:
yuzu = nhk15_men_short.scorecards[0]
# yuzu.print_scorecard()
kovtun = nhk15_men_short.scorecards[3]
kovtun.print_scorecard()

Skater(name='Maxim KOVTUN', country='RUS', discipline='men') 4 start: 11
TES: 42.81, PCS: 40.46, Total: 82.27
Deductions: -1.00
Element(number=1, name='4S+3T', info='', base_value=14.8, bonus=True, goe=0.71, goes=[0, 1, 1, 1, 1, 2, 0, 0, 1], points=15.51)
Element(number=2, name='4T', info='', base_value=10.3, bonus=True, goe=-2.63, goes=[-3, -2, -2, -1, -2, -2, -3, -2, -2], points=7.67)
Element(number=3, name='3A', info='', base_value=8.5, bonus=True, goe=-3.0, goes=[-3, -3, -3, -3, -3, -3, -3, -3, -3], points=5.5)
Element(number=4, name='CCoSp3p3', info='', base_value=3.0, bonus=True, goe=0.43, goes=[1, 0, 1, 1, 1, 1, 0, 1, 1], points=3.43)
Element(number=5, name='StSq4', info='', base_value=3.9, bonus=True, goe=1.1, goes=[1, 2, 1, 2, 2, 2, 1, 2, 1], points=5.0)
Element(number=6, name='CSSp2', info='', base_value=2.3, bonus=True, goe=0.5, goes=[2, 1, 1, 1, 1, 1, 1, 1, 1], points=2.8)
Element(number=7, name='FUSp3', info='', base_value=2.4, bonus=True, goe=0.5, goes=[2, 1, 1, 1, 1, 1, 

In [95]:
class Scorecard:
    def __init__(self, url, season, event, discipline, segment, skater,
                 rank, starting_number, total_score, tes, pcs, deductions):
        self.url = url
        self.season = season
        self.event = event
        self.discipline = discipline
        self.segment = segment
        self.skater = skater
        self.rank = int(rank)
        self.starting_number = int(starting_number)
        self.tes = float(tes)
        self.pcs = float(pcs)
        self.total_deductions = float(deductions)
        self.total_score = total_score
        
        self.elements = []
        self.components = []
        self.deductions = deductions
        
    def __repr__(self):
        return '{0} {1}: {2}'.format(self.season, self.segment, self.skater)
        
    def add_element(self, elt_info):
        number, name, info, base_value, bonus, goe, goes, points = elt_info
        number = int(number)
        base_value = float(base_value)
        bonus = bonus is not None
        goe = float(goe)
        
        # Split out individual judges' GOEs
        goe_re = re.compile('(-?[0123]\s*|-\s*)')
        goe_match = goe_re.findall(goes)
        if not goe_match:
            raise Exception(goes)
        goes = map(int, goe_match)
        
        points = float(points)
        self.elements.append(Element(number, name, info, base_value, bonus, goe, goes, points))

    def add_component(self, component_info):
        name, factor, scores, points = component_info
        factor = float(factor)
        comp_mark_re = re.compile('(\d?\d.\d\d\s*)')
        comp_match = comp_mark_re.findall(scores)
        if not comp_match:
            raise Exception(scores)
        scores = map(float, comp_match)
        
        points = float(points)
        self.components.append(ProgramComponent(name, factor, scores, points))
        
    def print_scorecard(self):
        print self.skater, self.rank, 'start:', self.starting_number
        print 'TES: {0}, PCS: {1}, Total: {2}'.format(self.tes, self.pcs, self.total_score)
        print 'Deductions: ' + str(self.deductions)
        for element in self.elements:
            print element
        for component in self.components:
            print component

In [15]:
class Discipline:
    def __init__(self, season, event, discipline):
        self.discipline = discipline  # of type DisciplineType
        self.season = season
        self.event = event
        self.segments = []  # list of Segments
        self.entries = []
        self.entries_url = None
        self.entries_fname = self.discipline.name + '_entries.html'
        self.results = []
        self.results_url = None
        self.results_fname = self.discipline.name + '_results.html'
    
    def __repr__(self):
        return self.event.name + ' ' + self.discipline.name
    
    def get_entries_results_pages(self):
        assert self.entries_url and self.results_url
        get_page(self.entries_url, self.season, self.event, self.entries_fname)
        get_page(self.results_url, self.season, self.event, self.results_fname)

In [16]:
class Event:
    def __init__(self, season, name):
        self.name = name      # the abbreviation + year
        self.season = season  # season object
        self.url = season.url + name + '/'
        self.dirpath = season.year + '/' + self.name + '/'
    
    def __repr__(self):
        return self.name

    def pdfs_to_csvs(self):
        subprocess.Popen('java -jar ' + tabula_path + ' -p all -b ' + self.dirpath, shell=True)

    def get_event_info(self, fetch_files=False):
        page = requests.get(self.url)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print 'Event not found: % s' % event
            return

        if not list(tree.iter('title')):
            print 'Event not found: % s' % event
            return
        else:
            print list(tree.iter('title'))[0].text

        # Drill down to the table part.
        table = tree.getchildren()[1].getchildren()[2].getchildren()
        elts = self._flatten_html_children(table)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        
        # 12 rows of actual table: cut out header and juniors
        rows = elts[2].getchildren()[0].getchildren()[1:13]

        self.disciplines = []
        for i, discipline_type in enumerate(list(DisciplineType)):
            
            discipline = Discipline(self.season, self, discipline_type)
            
            # Get entries and results pages.
            entries, results = self._flatten_html_children(rows[i * 3])
            discipline.entries_url = self.url + entries.attrib['href']
            discipline.results_url = self.url + results.attrib['href']
            if fetch_files:
                discipline.get_entries_results_pages()
            
            # Get segment files.
            for prog in (SegmentType.short, SegmentType.free):
                j = prog.value
                panel, detailed, scorecards = self._flatten_html_children(rows[i * 3 + j + 1])
                segment = Segment(self.url + scorecards.attrib['href'], self.season, self, discipline_type, prog)
                
                if fetch_files:
                    segment.get_page()
                discipline.segments.append(segment)

                panel = JudgePanel(self.url + panel.attrib['href'], self.season, self, discipline_type, segment)                
                if fetch_files:
                    panel.get_page()
            self.disciplines.append(discipline)
                
    def _flatten_html_children(self, elts):
        children = []
        for elt in elts:
            for child in elt.getchildren():
                children.append(child)
        return children

In [19]:
seas1415 = Season ('2015')
seas1516 = Season('2016')
for seas in (seas1415, seas1516):
    for event in seas.events:
        if event.name == 'gpfra2015':
            print 'GP France 2015 was cancelled partway through.'
            continue
        event.get_event_info()

ISU GP 2014 Hilton HHonors Skate America
ISU GP Skate Canada International 2014
ISU GP Lexus Cup of China 2014
ISU GP Rostelecom Cup 2014
ISU GP Trophee Eric Bompard 2014
ISU GP NHK Trophy 2014
ISU Grand Prix of Figure Skating Final 2014
ISU European Figure Skating Championships 2015
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2015
ISU GP 2015 Progressive Skate America
ISU GP Skate Canada International 2015
ISU GP Audi Cup of China 2015
ISU GP Rostelecom Cup 2015
GP France 2015 was cancelled partway through.
ISU GP NHK Trophy 2015
ISU Grand Prix of Figure Skating Final
ISU European Figure Skating Championships 2016
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2016


In [50]:
class Season:
    def __init__(self, spring_str_rep):
        champ_year = spring_str_rep
        gp_year = str(int(spring_str_rep) - 1)
        two_digit_season = int(spring_str_rep[2:])
        if two_digit_season <= 9:
            twotwo_year = '0' + str(two_digit_season - 1) + '0' + str(two_digit_season)
        elif two_digit_season <= 10:
            twotwo_year = '0' + str(two_digit_season - 1) + str(two_digit_season)
        else:
            twotwo_year = str(two_digit_season - 1) + str(two_digit_season)
        gpf_year = twotwo_year
        
        if int(spring_str_rep) % 4 == 2:
            champs = ['ec', 'fc', 'owg', 'wc']
        else:
            champs = ['ec', 'fc', 'wc']
        gps = ['usa', 'can', 'chn', 'rus', 'fra', 'jpn']
        gps = ['gp' + gp for gp in gps]
        
        self.year = twotwo_year
        
        event_names = [gp + gp_year for gp in gps]
        event_names.append('gpf' + twotwo_year)
        event_names += [champ + champ_year for champ in champs]
        
        self.url = 'http://www.isuresults.com/results/'
        if int(spring_str_rep) >= 2016:
            self.url += 'season'+ twotwo_year + '/'

        self.events = [Event(self, event_name) for event_name in event_names]

    def __repr__(self):
        return '\'' + self.year[:2] + '-\'' + self.year[2:]

In [165]:
def list_children(elts):
    children = []
    for elt in elts:
        for child in elt.getchildren():
            children.append(child)
    return children