## USFSA Results Scraping

### Part 1: Scraping the Webpages

#### Imports

In [64]:
import pandas as pd
import requests
import bs4
from collections import defaultdict
import re

#### Create a get request for the main URL

In [66]:
urls = {'2023-nationals': 
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/32002/index.html',
        
        '2023-ride-the-tide': 
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32006/index.html',
        
        '2023-golden-bear-skate':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32004/index.html',
        
        '2022-pioneer-open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32007/index.html',
        
        '2023-horsetooth-open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/33813/index.html',
        
        '2023-violet-classic':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2023/33799/index.html',
        
        '2024-DU-Open':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2024/33771/index.html',
        
        '2024-new-england-classic':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2024/33831/index.html',
        
        '2024-city-of-angels':
        'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2024/33785/index.html'
       }

In [67]:
# Replacement dictionaries; sometimes the university names in the results are not
# consistent!

# Pacific Coast
pc_rep = {
    'Ang...$': 'Angeles', 
    'Colorad...$': 'Colorado Springs',
    'Santa B...$': 'Santa Barbara',
    'Norther...$': 'Northern Colorado',
    'Lo...$': 'Los Angeles',
    'Sa...$': 'San Diego',
    'Souther...$': 'Southern CA'
}

In [108]:
# Change url to the competition of choice from the above dictionary
main_url = urls['2024-DU-Open']
request  = requests.get(main_url)

In [109]:
# Global Variables
soup   = bs4.BeautifulSoup(request.text)
events = soup.find_all("td", attrs = {"rowspan": 1})
links  = soup.find_all("td", attrs = {"class": "cm rb"})

In [110]:
baseurl = main_url.replace("index.html", "")

# Helper function to extract links
def extract_link(x):
    return x.find("a")["href"]

# Extract the url ends for each webpage
ends = list(map(extract_link, links))

# Request urls for each webpage
webpages = [baseurl + i for i in ends]
    

#### Function for processing one results page

In [111]:
# Function to parse each results page
def parse_results(html, team = False):
    
    '''
    Takes an html text object containing the results of
    one group
    
    :params html: html text
    :returns: A DataFrame containing the place and university
    for each start
    '''
    
    # Create a soup object and extract the rows of the results table
    soup = bs4.BeautifulSoup(html.text)
    res  = soup.find_all("td", attrs = {"colspan":1})
    rows = soup.find_all("tr")
    
    # Events that haven't happened yet
    rem = [x for x in rows if (len(x.find_all("td")) == 2)]
    
    ccounts = []
    if rem:
        for i, x in enumerate(rem):
            if team:
                uni = x.text
            else:
                uni = re.findall('[A-z\-\.\s]*$', x.text.split(', ')[-1])[0]
            ccounts.append(uni)
        return ccounts
    
    # Events where results are live
    else:
        temp = [x for x in rows if (len(x.find_all("td")) == 9 or len(x.find_all("td")) == 7) or 
                len(x.find_all("td")) == 7]
        #temp = [x for x in rows if (len(x.find_all("td")) == 9 or len(x.find_all("td")) == 7)]

        # Extract the University names from each page
        out  = []
        for i, x in enumerate(res):
            if team:
                uni = x.text
            else:
                uni = x.text.split(", ")[-1]
            out.append([temp[i].find("td").text, uni, temp[i].find_all("td")[-1].text])
        
        return pd.DataFrame(out, columns = ["Place", "College", "Tie"])

#### Loop through each page and extract the data

In [112]:
DFS = []
CCOUNTS = []
for i, x in enumerate(webpages):    
    temp = requests.get(x)
    try:   
        data = parse_results(temp)
        
        if type(data) == list:
            CCOUNTS += data
        
        else:
            data = data.loc[~data["Tie"].str.contains("Withdraw")]
            DFS.append(data)
    except:
        pass   

### Part 2: Calculating the points awarded for each event

In [113]:
lookup = {24: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          23: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          22: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          21: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          20: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
          19: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1],
          18: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1, 1],
          17: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1, 1],
          16: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1, 1],
          15: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1, 1],
          14: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1, 1],
          13: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1],
          12: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
          11: [12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
          10: [12, 10, 8, 7, 6, 5, 4, 3, 2, 1],
          9: [12, 10, 8, 6, 5, 4, 3, 2, 1],
          8: [12, 10, 8, 6, 4, 3, 2, 1],
          7: [12, 10, 8, 6, 4, 2, 1],
          6: [12, 10, 8, 6, 4, 2],
          5: [10, 8, 6, 4, 2],
          4: [8, 6, 4, 2],
          3: [6, 4, 2],
          2: [6, 4],
          1: [6]
         }

In [114]:
OUT = []
for i, x in enumerate(DFS):
    num = len(x)
    
    # Assign a number of points to each column
    x = x.assign(points = lookup[num])

    # Handle ties
    temp = x.groupby("Place")["points"].transform(lambda x: x.mean())
    x = x.assign(points = temp)
    
    # Handle championship event edge case
    if "Championship" in events[i].text or "International" in events[i].text:
        x["points"] = x["points"] + 2
    
    OUT.append(x)

In [115]:
try:
    FULL = pd.concat(OUT)

    # Eliminate the error where it says 'U' instead of 'University'
    # Keep in mind there are still a few errors with the names of the colleges
    # that I have not fixed yet
    import re
    FULL['College'] = FULL['College'].apply(lambda x: re.sub('^U ', 'University ', x))
    FULL['College'] = FULL['College'].apply(lambda x: re.sub(' U$', ' University', x))
except:
    print('Competition has not started yet!')

#### Team Standings

In [116]:
A = FULL.groupby("College")["points"].sum().sort_values(ascending = False)
A

College
Univ of Denver              255.0
Univ of Calif Los Ang...    226.0
Univ of Calif San Diego     203.0
Univ of Calif Berkeley      200.0
Univ of Colo Boulder        199.0
Stanford Univ               182.0
Univ of Washington          181.0
Arizona State Univ          135.0
Colo State Univ             128.0
Utah State Univ             126.0
Univ of Colo Colo Spr...    113.0
Univ of Southern Calif      108.0
Univ of Calif Davis          34.0
Univ of Calif Santa B...     22.0
Colo College                 19.0
Western Washington Univ      14.0
Name: points, dtype: float64

#### Number of Starts per Team

In [117]:
B = FULL.groupby("College").count()["Place"].sort_values(ascending=False)
B

College
Univ of Calif Los Ang...    30
Univ of Denver              30
Colo State Univ             29
Stanford Univ               29
Univ of Calif San Diego     29
Univ of Colo Boulder        29
Univ of Washington          29
Arizona State Univ          28
Univ of Calif Berkeley      28
Univ of Southern Calif      24
Univ of Colo Colo Spr...    19
Utah State Univ             19
Univ of Calif Davis          8
Colo College                 7
Univ of Calif Santa B...     5
Western Washington Univ      4
Name: Place, dtype: int64

#### Points per start ratio

In [118]:
C = pd.merge(A.to_frame(), B.to_frame(), left_index=True, right_index=True)
C.columns = ["Points", "Number of Starts"]

C["Ratio"] = C["Points"] / C["Number of Starts"]
C.sort_values(by="Ratio", ascending = False)

Unnamed: 0_level_0,Points,Number of Starts,Ratio
College,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Univ of Denver,255.0,30,8.5
Univ of Calif Los Ang...,226.0,30,7.533333
Univ of Calif Berkeley,200.0,28,7.142857
Univ of Calif San Diego,203.0,29,7.0
Univ of Colo Boulder,199.0,29,6.862069
Utah State Univ,126.0,19,6.631579
Stanford Univ,182.0,29,6.275862
Univ of Washington,181.0,29,6.241379
Univ of Colo Colo Spr...,113.0,19,5.947368
Arizona State Univ,135.0,28,4.821429


#### Starts Remaining

In [119]:
REM = pd.Series(CCOUNTS).replace(pc_rep, regex=True)
REM = REM.str.replace('Univ.', 'University')
REM = REM.value_counts()
REM.sum()

0

In [120]:
D = pd.merge(C, REM, how='left', left_index=True, right_index=True)
D.columns = list(D.columns[:-1]) + ['Starts Remaining']
D['Predicted Points'] = D['Starts Remaining'] * D['Ratio'] + D['Points']
D.sort_values(by="Predicted Points", ascending = False)

Unnamed: 0_level_0,Points,Number of Starts,Ratio,Starts Remaining,Predicted Points
College,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Univ of Denver,255.0,30,8.5,,
Univ of Calif Los Ang...,226.0,30,7.533333,,
Univ of Calif San Diego,203.0,29,7.0,,
Univ of Calif Berkeley,200.0,28,7.142857,,
Univ of Colo Boulder,199.0,29,6.862069,,
Stanford Univ,182.0,29,6.275862,,
Univ of Washington,181.0,29,6.241379,,
Arizona State Univ,135.0,28,4.821429,,
Colo State Univ,128.0,29,4.413793,,
Utah State Univ,126.0,19,6.631579,,


#### Total Number of Starts

In [121]:
C["Number of Starts"].sum()

347

#### To get a general count of how many of each type of official each competition has

In [122]:
import numpy as np

In [123]:
DFS = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    

In [124]:
def process_judges(text):
    soup = bs4.BeautifulSoup(text)
    tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    offs = tabs[np.where(tabs == 'Judge 1')[0][0]:][1::3][:-1]
    return list(offs)

In [125]:
def process_referees(text):
    soup = bs4.BeautifulSoup(text)
    tabs = tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    refs = tabs[np.where(tabs == 'Referee')[0][0] + 1]
    return [refs]

In [126]:
def process_accountants(text):
    soup = bs4.BeautifulSoup(text)
    tabs = tabs = np.array(list(map(lambda x: x.text, soup.find_all('td'))))
    accs = tabs[np.where(tabs == 'Accountant')[0][0] + 1]
    return [accs]

In [127]:
JUDGES = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    jlst = process_judges(temp.text)
    JUDGES += jlst

In [128]:
REFEREES = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    rlst = process_referees(temp.text)
    REFEREES += rlst

In [None]:
ACCOUNTANTS = []
for i, x in enumerate(webpages):
    temp = requests.get(x)
    alst = process_accountants(temp.text)
    ACCOUNTANTS += alst

In [None]:
pd.Series(JUDGES).unique()

In [None]:
pd.Series(REFEREES).unique()

In [None]:
pd.Series(ACCOUNTANTS).unique()