In [355]:
from collections import namedtuple
from enum import Enum
from lxml import html, etree
import os
import requests

In [356]:
class DisciplineType(Enum):
    men = 0
    ladies = 1
    pairs = 2
    ice_dance = 3

In [357]:
class SegmentType(Enum):
    short = 0
    free = 1
    original_dance = 2

In [358]:
Judge = namedtuple('Judge', ['name', 'country'])

In [359]:
def get_page(url, season, event, fname):
    target_dir = season.year + '/' + event.name + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'w+') as f:
        f.write(page.content)

In [360]:
class JudgePanel:
    def __init__(self, url):
        self.url = url

    def get_page(self, season, competition, discipline, segment):
        get_page(self.url, season, competition, discipline.name + '_' + segment.name + '_panel.html')
        
# ['referee', 'technical_controller', 'technical_specialist', 'asst_technical_specialist',
#                          'judges', 'num_judges', 'data_operator', 'replay_operator',
#                          'file_name', 'url'])  # where this panel info came from

In [361]:
Skater = namedtuple('Skater', ['name', 'country', 'discipline'])

In [362]:
Element = namedtuple('Element', ['name',  # string, e.g. 3A
                                 'base_value',
                                 'info',  # e.g. UR, edge call, REP
                                 'goes']) # list

In [363]:
ProgramComponent = namedtuple('ProgramComponent', ['name', 'factor', 'scores'])

In [364]:
class ScoreCard:
    def __init__(self):
    
#         self.rank
#         self.skater  # type Skater
#         self.start_number  # skate order
#         self.total_score
#         self.tes
#         self.pcs
#         self.deductions # maps deduction text to point value
#         self.elements   # list of Elements
#         self.pcs        # list of Program Components
        
#         self.page_number

IndentationError: expected an indented block (<ipython-input-364-34997714df6b>, line 14)

In [365]:
class Segment:
    def __init__(self, segment_type, url):
        self.type = SegmentType(segment_type)
#         self.scorecards
#         self.panel # JudgePanel
        self.url = url  # score url
        self.panel = None

    def get_page(self, season, competition, discipline):
        get_page(self.url, season, competition, discipline.name + '_' + self.type.name + '.pdf')


In [366]:
class Discipline:
    def __init__(self, discipline):
        self.discipline = discipline  # of type DisciplineType
        self.segments = []  # list of Segments
        self.entries = []
        self.entries_url = None
        self.results = []
        self.results_url = None
    
    def get_entries_results_pages(self, season, competition):
        assert self.entries_url and self.results_url
        get_page(self.entries_url, season, competition, self.discipline.name + '_entries.html')
        get_page(self.results_url, season, competition, self.discipline.name + '_results.html')

In [383]:
class Event:
    def __init__(self, season, name):
        self.name = name      # the abbreviation + year
        self.season = season  # season object
        self.url = season.url + name + '/'
    
    def __repr__(self):
        return self.name
    
    def get_event_files(self):
        page = requests.get(self.url)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print 'Event not found: % s' % event
            return

        if not list(tree.iter('title')):
            print 'Event not found: % s' % event
            return
        else:
            print list(tree.iter('title'))[0].text
        
        # Drill down to the table part.
        table = tree.getchildren()[1].getchildren()[2].getchildren()
        elts = self._flatten_html_children(table)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        
        # 12 rows of actual table: cut out header and juniors
        rows = elts[2].getchildren()[0].getchildren()[1:13]

        self.disciplines = []
        for i, discipline_type in enumerate(list(DisciplineType)):
            
            discipline = Discipline(discipline_type)
            
            # Get entries and results pages.
            entries, results = self._flatten_html_children(rows[i * 3])
            discipline.entries_url = self.url + entries.attrib['href']
            discipline.results_url = self.url + results.attrib['href']
            discipline.get_entries_results_pages(self.season, self)
            
            # Get segment files.
            for prog in (SegmentType.short, SegmentType.free):
                j = prog.value
                panel, detailed, scorecards = self._flatten_html_children(rows[i * 3 + j + 1])
                segment = Segment(prog, self.url + scorecards.attrib['href'])
                segment.get_page(self.season, self, discipline_type)
                discipline.segments.append(segment)
                panel = JudgePanel(self.url + panel.attrib['href'])
                panel.get_page(self.season, self, discipline_type, prog)
            self.disciplines.append(discipline)
                
                
    def _flatten_html_children(self, elts):
        children = []
        for elt in elts:
            for child in elt.getchildren():
                children.append(child)
        return children

In [389]:
for year in ('2013', '2014', '2015'):
    season = Season(year)
    for event in season.events:
        event.get_event_files()

ISU GP Hilton HHonors Skate America 2012


ValueError: too many values to unpack

In [387]:
seas = Season('2016')
for event in seas.events:
    if event.name == 'gpfra2015':
        print 'GP France 2015 was cancelled partway through.'
        continue
    event.get_event_files()

ISU GP 2015 Progressive Skate America
ISU GP Skate Canada International 2015
ISU GP Audi Cup of China 2015
ISU GP Rostelecom Cup 2015
GP France 2015 was cancelled partway through.
ISU GP NHK Trophy 2015
ISU Grand Prix of Figure Skating Final
ISU European Figure Skating Championships 2016
ISU Four Continents Figure Skating Championships
ISU World Figure Skating Championships 2016


In [388]:
class Season:
    def __init__(self, spring_str_rep):
        champ_year = spring_str_rep
        gp_year = str(int(spring_str_rep) - 1)
        two_digit_season = int(spring_str_rep[2:])
        if two_digit_season <= 9:
            twotwo_year = '0' + str(two_digit_season - 1) + '0' + str(two_digit_season)
        elif two_digit_season <= 10:
            twotwo_year = '0' + str(two_digit_season - 1) + str(two_digit_season)
        else:
            twotwo_year = str(two_digit_season - 1) + str(two_digit_season)
        gpf_year = twotwo_year
        
        if int(spring_str_rep) % 4 == 0:
            champs = ['ec', 'fc', 'owg', 'wc']
        else:
            champs = ['ec', 'fc', 'wc']
        gps = ['usa', 'can', 'chn', 'rus', 'fra', 'jpn']
        gps = ['gp' + gp for gp in gps]
        
        self.year = twotwo_year
        
        event_names = [gp + gp_year for gp in gps]
        event_names.append('gpf' + twotwo_year)
        event_names += [champ + champ_year for champ in champs]
        
        self.url = 'http://www.isuresults.com/results/'
        if int(spring_str_rep) >= 2016:
            self.url += 'season'+ twotwo_year + '/'

        self.events = [Event(self, event_name) for event_name in event_names]

    def __repr__(self):
        return 'Season: \'' + self.year[:2] + '-\'' + self.year[2:]

In [165]:
def list_children(elts):
    children = []
    for elt in elts:
        for child in elt.getchildren():
            children.append(child)
    return children