In [324]:
from collections import namedtuple
from enum import Enum
from lxml import html, etree
import os
import requests

In [325]:
class DisciplineType(Enum):
    men = 0
    ladies = 1
    pairs = 2
    ice_dance = 3

In [326]:
class SegmentType(Enum):
    short = 0
    free = 1
    original_dance = 2

In [327]:
Judge = namedtuple('Judge', ['name', 'country'])

In [328]:
def get_page(url, season, event, fname):
    target_dir = season.year + '/' + event.name + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'w+') as f:
        f.write(page.content)

In [329]:
class JudgePanel:
    def __init__(self, url):
        self.url = url

    def get_page(self, season, competition, discipline, segment):
        get_page(self.url, season, competition, discipline.name + '_' + segment.name + '_panel.html')
        
# ['referee', 'technical_controller', 'technical_specialist', 'asst_technical_specialist',
#                          'judges', 'num_judges', 'data_operator', 'replay_operator',
#                          'file_name', 'url'])  # where this panel info came from

In [330]:
Skater = namedtuple('Skater', ['name', 'country', 'discipline'])

In [331]:
Element = namedtuple('Element', ['name',  # string, e.g. 3A
                                 'base_value',
                                 'info',  # e.g. UR, edge call, REP
                                 'goes']) # list

In [332]:
ProgramComponent = namedtuple('ProgramComponent', ['name', 'factor', 'scores'])

In [333]:
class ScoreCard:
    def __init__(self):
    
#         self.rank
#         self.skater  # type Skater
#         self.start_number  # skate order
#         self.total_score
#         self.tes
#         self.pcs
#         self.deductions # maps deduction text to point value
#         self.elements   # list of Elements
#         self.pcs        # list of Program Components
        
#         self.page_number

IndentationError: expected an indented block (<ipython-input-333-34997714df6b>, line 14)

In [334]:
class Segment:
    def __init__(self, segment_type, url):
        self.type = SegmentType(segment_type)
#         self.scorecards
#         self.panel # JudgePanel
        self.url = url  # score url
        self.panel = None

    def get_page(self, season, competition, discipline):
        get_page(self.url, season, competition, discipline.name + '_' + self.type.name + '.pdf')


In [341]:
class Discipline:
    def __init__(self, discipline):
        self.discipline = discipline  # of type DisciplineType
        self.segments = []  # list of Segments
        self.entries = []
        self.entries_url = None
        self.results = []
        self.results_url = None
    
    def get_entries_results_pages(self, season, competition):
        assert self.entries_url and self.results_url
        get_page(self.entries_url, season, competition, self.discipline.name + '_entries.html')
        get_page(self.results_url, season, competition, self.discipline.name + '_results.html')

In [342]:
class Event:
    def __init__(self, season, name):
        self.name = name      # the abbreviation + year
        self.season = season  # season object
        self.url = 'http://www.isuresults.com/results/' + name + '/'  # will not work for 16-17 or 17-18 season
    
    def __repr__(self):
        return self.name
    
    def get_event_files(self):
        page = requests.get(self.url)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print 'Event not found: % s' % event
            return

        if not list(tree.iter('title')):
            print 'Event not found: % s' % event
            return
        else:
            print list(tree.iter('title'))[0].text
        
        # Drill down to the table part.
        table = tree.getchildren()[1].getchildren()[2].getchildren()
        elts = self._flatten_html_children(table)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        elts = self._flatten_html_children(elts)
        
        # 12 rows of actual table
        rows = elts[2].getchildren()[0].getchildren()[1:]
        
        assert len(rows) == 12
        
        self.disciplines = []
        for i, discipline_type in enumerate(list(DisciplineType)):
            
            discipline = Discipline(discipline_type)
            
            # Get entries and results pages.
            entries, results = self._flatten_html_children(rows[i * 3])
            discipline.entries_url = self.url + entries.attrib['href']
            discipline.results_url = self.url + results.attrib['href']
            discipline.get_entries_results_pages(self.season, self)
            
            # Get segment files.
            for prog in (SegmentType.short, SegmentType.free):
                j = prog.value
                panel, detailed, scorecards = self._flatten_html_children(rows[i * 3 + j + 1])
                segment = Segment(prog, self.url + scorecards.attrib['href'])
                segment.get_page(self.season, self, discipline_type)
                discipline.segments.append(segment)
                panel = JudgePanel(self.url + panel.attrib['href'])
                panel.get_page(self.season, self, discipline_type, prog)
            self.disciplines.append(discipline)
                
                
    def _flatten_html_children(self, elts):
        children = []
        for elt in elts:
            for child in elt.getchildren():
                children.append(child)
        return children

In [None]:
seas16 = Season('2015')
for event in seas16.events:
    event.get_event_files()

ISU GP 2014 Hilton HHonors Skate America


In [261]:
seas15.year

'1415'

In [162]:
class Season:
    def __init__(self, spring_str_rep):
        champ_year = spring_str_rep
        gp_year = str(int(spring_str_rep) - 1)
        two_digit_season = int(spring_str_rep[2:])
        if two_digit_season <= 9:
            twotwo_year = '0' + str(two_digit_season - 1) + '0' + str(two_digit_season)
        elif two_digit_season <= 10:
            twotwo_year = '0' + str(two_digit_season - 1) + str(two_digit_season)
        else:
            twotwo_year = str(two_digit_season - 1) + str(two_digit_season)
        gpf_year = twotwo_year
        
        champs = ['ec', 'fc', 'wc']
        gps = ['usa', 'can', 'chn', 'rus', 'fra', 'jpn']
        gps = ['gp' + gp for gp in gps]
        
        self.year = twotwo_year
        
        event_names = [gp + gp_year for gp in gps]
        event_names.append('gpf' + twotwo_year)
        event_names += [champ + champ_year for champ in champs]
        
        self.events = [Event(self, event_name) for event_name in event_names]
    
    def __repr__(self):
        return 'Season: \'' + self.year[:2] + '-\'' + self.year[2:]

In [122]:
def print_events(events):
    for event in events:
        page = requests.get('http://www.isuresults.com/results/' + event)
        try:
            tree = html.fromstring(page.content)
        except etree.ParserError:
            print 'Event not found: % s' % event
            continue

        if not list(tree.iter('title')):
            print 'Event not found: % s' % event
        else:
            print list(tree.iter('title'))[0].text
#             if 'Four' in list(tree.iter('title'))[0].text:
#                 body = etree.SubElement(tree, 'body')
#                 print type(body)
#                 # print etree.tostring(tree)

In [222]:
url = 'http://www.isuresults.com/results/fc2015'
page = requests.get(url)
tree = html.fromstring(page.content)
table = tree.getchildren()[1].getchildren()[2].getchildren()

In [165]:
def list_children(elts):
    children = []
    for elt in elts:
        for child in elt.getchildren():
            children.append(child)
    return children

In [227]:
elts = list_children(table)
elts = list_children(elts)
elts = list_children(elts)
elts = list_children(elts)
rows = elts[2].getchildren()[0].getchildren()[1:]

In [238]:
len(rows)

12

In [68]:
rows = [tr.getchildren() for tr in elts]
rows = rows[1:]  # cut out the header of the table

# Get rid of the time schedule at the bottom.
row_names = [row[0].text for row in rows]
seen_id = False
break_index = -1
for i, name in enumerate(row_names):
    if seen_id and name:
        break_index = i
        break
    if name == 'Ice Dance':
        seen_id = True
        continue

rows = rows[:break_index]

In [100]:
num_disciplines_seen = 0
discipline = None
segment = None
for row in rows:
    is_first_row = False
    if row[0].text is not None:
        discipline = row[0].text
        segment = None
        entries, results = list_children(rows[0])
        print entries.attrib['href'], results.attrib['href']

CAT001EN.HTM CAT001RS.HTM
CAT001EN.HTM CAT001RS.HTM
CAT001EN.HTM CAT001RS.HTM
CAT001EN.HTM CAT001RS.HTM


In [77]:
children = list_children(rows[0])
print children

[<Element a at 0x10e3ec260>, <Element a at 0x10e3ec368>]


In [83]:
children[0].attrib['href']

{'href': 'CAT001EN.HTM'}

In [85]:
url + '/' + children[0].attrib['href']

'http://www.isuresults.com/results/fc2015/CAT001EN.HTM'

In [86]:
page = requests.get('http://www.isuresults.com/results/fc2015/CAT001EN.HTM')

In [89]:
with open('temp.html', 'w+') as f:
    f.write(page.content)

In [91]:
page = requests.get('http://www.isuresults.com/results/fc2015/fc2015_Men_SP_Scores.pdf')
with open('temp.pdf', 'w+') as f:
    f.write(page.content)

IOError: [Errno 2] No such file or directory: 'folder/temp.pdf'

In [92]:
def get_page(url, season, competition, fname):
    target_dir = season + '/' + competition + '/'
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    page = requests.get(url)
    with open(target_dir + fname, 'w+') as f:
        f.write(page.content)