## USFSA Results Scraping

### Part 1: Scraping the Webpages

#### Imports

In [18]:
import pandas as pd
import requests
import bs4
from collections import defaultdict
import re

#### Create a get request for the main URL

In [19]:
urls = {'2023-nationals': 
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/32002/index.html',
        
        '2023-ride-the-tide': 
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32006/index.html',
        
        '2023-golden-bear-skate':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32004/index.html',
        
        '2022-pioneer-open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32007/index.html',
        
        '2023-horsetooth-open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/33813/index.html',
        
        '2023-violet-classic':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/33799/index.html',
        
        '2024-DU-Open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2024/33771/index.html',
        
        '2024-new-england-classic':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2024/33831/index.html'
       }

In [20]:
# Change url to the competition of choice from the above dictionary
main_url = urls['2024-new-england-classic']
request  = requests.get(main_url)

In [21]:
# Global Variables
soup   = bs4.BeautifulSoup(request.text)
events = soup.find_all("td", attrs = {"rowspan": 1})
links  = soup.find_all("td", attrs = {"class": "cm rb"})

In [22]:
baseurl = main_url.replace("index.html", "")

# Helper function to extract links
def extract_link(x):
    return x.find("a")["href"]

# Extract the url ends for each webpage
ends = list(map(extract_link, links))

# Request urls for each webpage
webpages = [baseurl + i for i in ends]
    

#### Function for processing one results page

In [23]:
# Function to parse each results page
def parse_results(html, team = False):
    
    '''
    Takes an html text object containing the results of
    one group
    
    :params html: html text
    :returns: A DataFrame containing the place and university
    for each start
    '''
    
    # Create a soup object and extract the rows of the results table
    soup = bs4.BeautifulSoup(html.text)
    res  = soup.find_all("td", attrs = {"colspan":1})
    rows = soup.find_all("tr")
    
    # Events that haven't happened yet
    rem = [x for x in rows if (len(x.find_all("td")) == 2)]
    
    ccounts = []
    if rem:
        for i, x in enumerate(rem):
            if team:
                uni = x.text
            else:
                uni = re.findall('[A-z\s]*$', x.text.split(', ')[-1])[0]
            ccounts.append(uni)
        return ccounts
    
    # Events where results are live
    else:
        temp = [x for x in rows if (len(x.find_all("td")) == 9 or len(x.find_all("td")) == 7) or 
                len(x.find_all("td")) == 7]
        #temp = [x for x in rows if (len(x.find_all("td")) == 9 or len(x.find_all("td")) == 7)]

        # Extract the University names from each page
        out  = []
        for i, x in enumerate(res):
            if team:
                uni = x.text
            else:
                uni = x.text.split(", ")[-1]
            out.append([temp[i].find("td").text, uni, temp[i].find_all("td")[-1].text])
        
        return pd.DataFrame(out, columns = ["Place", "College", "Tie"])

#### Loop through each page and extract the data

In [24]:
DFS = []
CCOUNTS = []
for i, x in enumerate(webpages):    
    temp = requests.get(x)
    try:   
        data = parse_results(temp)
        
        if type(data) == list:
            CCOUNTS += data
        
        else:
            data = data.loc[~data["Tie"].str.contains("Withdraw")]
            DFS.append(data)
    except:
        pass   

### Part 2: Calculating the points awarded for each event

In [25]:
lookup = {24: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          23: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          22: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          21: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          20: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          19: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1],
          18: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1],
          17: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1],
          16: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1],
          15: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1],
          14: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1],
          13: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1],
          12: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
          11: [12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
          10: [12, 10, 8, 7, 6, 5, 4, 3, 2, 1],
          9: [12, 10, 8, 6, 5, 4, 3, 2, 1],
          8: [12, 10, 8, 6, 4, 3, 2, 1],
          7: [12, 10, 8, 6, 4, 2, 1],
          6: [12, 10, 8, 6, 4, 2],
          5: [10, 8, 6, 4, 2],
          4: [8, 6, 4, 2],
          3: [6, 4, 2],
          2: [6, 4],
          1: [6]
         }

In [26]:
OUT = []
for i, x in enumerate(DFS):
    num = len(x)
    
    # Assign a number of points to each column
    x = x.assign(points = lookup[num])

    # Handle ties
    temp = x.groupby("Place")["points"].transform(lambda x: x.mean())
    x = x.assign(points = temp)
    
    # Handle championship event edge case
    if "Championship" in events[i].text or "International" in events[i].text:
        x["points"] = x["points"] + 2
    
    OUT.append(x)

In [27]:
try:
    FULL = pd.concat(OUT)

    # Eliminate the error where it says 'U' instead of 'University'
    # Keep in mind there are still a few errors with the names of the colleges
    # that I have not fixed yet
    import re
    FULL['College'] = FULL['College'].apply(lambda x: re.sub('^U ', 'University ', x))
    FULL['College'] = FULL['College'].apply(lambda x: re.sub(' U$', ' University', x))
except:
    print('Competition has not started yet!')

#### Team Standings

In [28]:
A = FULL.groupby("College")["points"].sum().sort_values(ascending = False)
A

College
Boston University          279.0
New York University        272.0
Sacred Heart University    258.0
Dartmouth College          253.0
Cornell University         245.0
Northeastern University    172.0
Univ of Connecticut        151.0
Columbia University        130.0
University of Vermont       73.0
University of Rochester     52.0
Merrimack College           51.0
Brown University            50.0
Yale University             49.0
MIT                         41.0
Boston College              40.0
University of Maine         40.0
Quinnipiac University       31.0
UMASS Lowell                30.0
Harvard University          20.0
Providence College          14.0
Amherst College             12.0
Holy Cross                   5.0
Stony Brook University       1.0
Name: points, dtype: float64

#### Number of Starts per Team

In [29]:
B = FULL.groupby("College").count()["Place"].sort_values(ascending=False)
B

College
New York University        30
Boston University          30
Cornell University         30
Dartmouth College          30
Northeastern University    30
Sacred Heart University    30
Univ of Connecticut        29
Columbia University        28
University of Vermont      21
Merrimack College          17
University of Maine        14
MIT                        13
Quinnipiac University      12
University of Rochester    12
Yale University            12
Brown University           12
Boston College              8
UMASS Lowell                6
Harvard University          4
Amherst College             3
Providence College          2
Holy Cross                  2
Stony Brook University      1
Name: Place, dtype: int64

#### Points per start ratio

In [30]:
C = pd.merge(A.to_frame(), B.to_frame(), left_index=True, right_index=True)
C.columns = ["Points", "Number of Starts"]

C["Ratio"] = C["Points"] / C["Number of Starts"]
C.sort_values(by="Ratio", ascending = False)

Unnamed: 0_level_0,Points,Number of Starts,Ratio
College,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Boston University,279.0,30,9.3
New York University,272.0,30,9.066667
Sacred Heart University,258.0,30,8.6
Dartmouth College,253.0,30,8.433333
Cornell University,245.0,30,8.166667
Providence College,14.0,2,7.0
Northeastern University,172.0,30,5.733333
Univ of Connecticut,151.0,29,5.206897
Boston College,40.0,8,5.0
Harvard University,20.0,4,5.0


#### Starts Remaining

In [36]:
REM = pd.Series(CCOUNTS).value_counts()

In [40]:
D = pd.merge(C, REM, how='left', left_index=True, right_index=True)
D.columns = list(D.columns[:-1]) + ['Starts Remaining']
D['Predicted Points'] = D['Starts Remaining'] * D['Ratio'] + D['Points']
D.sort_values(by="Predicted Points", ascending = False)

Unnamed: 0_level_0,Points,Number of Starts,Ratio,Starts Remaining,Predicted Points
College,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Boston University,279.0,30,9.3,,
New York University,272.0,30,9.066667,,
Sacred Heart University,258.0,30,8.6,,
Dartmouth College,253.0,30,8.433333,,
Cornell University,245.0,30,8.166667,,
Northeastern University,172.0,30,5.733333,,
Univ of Connecticut,151.0,29,5.206897,,
Columbia University,130.0,28,4.642857,,
University of Vermont,73.0,21,3.47619,,
University of Rochester,52.0,12,4.333333,,


#### Number of Starts

In [41]:
C["Number of Starts"].sum()

376

#### To get a general count of how many of each type of official each competition has

In [42]:
import numpy as np

In [43]:
DFS = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    

In [44]:
def process_judges(text):
    soup = bs4.BeautifulSoup(text)
    tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    offs = tabs[np.where(tabs == 'Judge 1')[0][0]:][1::3][:-1]
    return list(offs)

In [45]:
def process_referees(text):
    soup = bs4.BeautifulSoup(text)
    tabs = tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    refs = tabs[np.where(tabs == 'Referee')[0][0] + 1]
    return [refs]

In [46]:
def process_accountants(text):
    soup = bs4.BeautifulSoup(text)
    tabs = tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    accs = tabs[np.where(tabs == 'Accountant')[0][0] + 1]
    return [accs]

In [47]:
JUDGES = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    jlst = process_judges(temp.text)
    JUDGES += jlst

In [48]:
REFEREES = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    rlst = process_referees(temp.text)
    REFEREES += rlst

In [49]:
ACCOUNTANTS = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    alst = process_accountants(temp.text)
    ACCOUNTANTS += alst

In [50]:
pd.Series(JUDGES).unique()

array(['Elly Atwood', 'Christopher Brunner', 'Constance Cataldo',
       'Nancy Crossman', 'Theresa Dragos', 'Mary-Elizabeth Wightman',
       'Amelie Johnson', 'Susan Keogh', 'Meghan Lapointe', 'Marie Truppa',
       'Susan Scott', 'Sarahjayne Howland*', 'Arthur Bahr',
       'Emilieanne Koehnlein', 'Rebecca Ye', 'Chia Ying Lee',
       'Laura Days'], dtype=object)

In [51]:
pd.Series(REFEREES).unique()

array(['Elly Atwood', 'Ann Buckley', 'Susan Keogh', 'Sarahjayne Howland*',
       'Susan Scott'], dtype=object)

In [52]:
pd.Series(ACCOUNTANTS).unique()

array(['Caryn Bickerstaff*'], dtype=object)