## USFSA Results Scraping

### Part 1: Scraping the Webpages

#### Imports

In [1]:
import pandas as pd
import requests
import bs4

#### Send a GET request to the main URL

In [2]:
# Index page of the competition to scrape. The commented-out lines are
# alternative competition index pages; uncomment exactly one to switch.
main_url = "https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32007/index.html"
#main_url = "https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32017/index.html"
#main_url = "https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32005/index.html"

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# fast on an HTTP error instead of parsing an error page as if it were the index.
request  = requests.get(main_url, timeout = 30)
request.raise_for_status()

In [3]:
# Global Variables
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ between machines.
soup   = bs4.BeautifulSoup(request.text, "html.parser")

# <td rowspan="1"> cells; their text is later read as the event name
events = soup.find_all("td", attrs = {"rowspan": 1})

# <td class="cm rb"> cells; each is expected to wrap the <a> link to a results page
links  = soup.find_all("td", attrs = {"class": "cm rb"})

In [4]:
# Directory part of the index URL (everything up to and including the last "/").
# Dropping the final path segment works whatever the index page is called,
# not only when it is literally named "index.html".
baseurl = main_url.rsplit("/", 1)[0] + "/"

# Helper: pull the link target out of one table cell
def extract_link(x):
    """Return the ``href`` attribute of the first ``<a>`` tag inside ``x``."""
    anchor = x.find("a")
    return anchor["href"]

# Link target (href) of every results-page cell, in table order
ends = [extract_link(cell) for cell in links]

# Full URL of each results page: the index directory plus the link target
webpages = [baseurl + end for end in ends]
    

#### Function for processing one results page

In [5]:
# Function to parse each results page
def parse_results(html, team = False):

    '''
    Parse the results table of one group into a DataFrame.

    :param html: response-like object whose ``.text`` attribute holds the
        page's HTML (e.g. the value returned by ``requests.get``); a plain
        HTML string is accepted as well
    :param team: if True, the full text of each name cell is used as the
        college; otherwise only the part after the last ", " is used
    :returns: A DataFrame with columns "Place", "College" and "Tie", one row
        per ``<td colspan="1">`` cell found on the page
    :raises IndexError: if the page has more ``<td colspan="1">`` cells than
        table rows with 7 or 9 cells
    '''

    # Accept either a response object or the raw HTML text itself
    markup = getattr(html, "text", html)

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed
    soup = bs4.BeautifulSoup(markup, "html.parser")
    res  = soup.find_all("td", attrs = {"colspan": 1})

    # Collect each row's cells once, keeping only rows with 7 or 9 cells
    # (presumably the two result-table layouts; confirm against the site)
    row_cells = [row.find_all("td") for row in soup.find_all("tr")]
    row_cells = [cells for cells in row_cells if len(cells) in (7, 9)]

    # Pair the i-th name cell with the i-th result row
    out = []
    for i, x in enumerate(res):
        uni   = x.text if team else x.text.split(", ")[-1]
        cells = row_cells[i]
        # First cell is stored as "Place", last cell as "Tie"
        out.append([cells[0].text, uni, cells[-1].text])

    return pd.DataFrame(out, columns = ["Place", "College", "Tie"])

#### Loop through each page and extract the data

In [6]:
# Download and parse every results page, dropping withdrawn entries.
# NOTE(review): a skipped page shifts the positions in DFS relative to
# `webpages`/`events`; the scoring cell looks events up by position in DFS.
DFS = []
for url in webpages:
    # Bound the wait so one stalled page cannot hang the whole run
    response = requests.get(url, timeout = 30)
    try:
        data = parse_results(response)
        # Rows whose last column mentions "Withdraw" are removed
        data = data.loc[~data["Tie"].str.contains("Withdraw")]
    except Exception as err:
        # Still best-effort, but report what was skipped instead of hiding it;
        # unlike a bare except this lets KeyboardInterrupt through.
        print("Skipping", url, "-", repr(err))
        continue
    DFS.append(data)

### Part 2: Calculating the points awarded for each event

In [7]:
# Points table keyed by the number of entries in a group.
# lookup[n][k] is the points given to the k-th row (0-based) of a group with
# n entries; every list therefore has exactly n values. Sizes 1-13 are covered.
lookup = {
    1:  [6],
    2:  [6, 4],
    3:  [6, 4, 2],
    4:  [8, 6, 4, 2],
    5:  [10, 8, 6, 4, 2],
    6:  [12, 10, 8, 6, 4, 2],
    7:  [12, 10, 8, 6, 4, 2, 1],
    8:  [12, 10, 8, 6, 4, 3, 2, 1],
    9:  [12, 10, 8, 6, 5, 4, 3, 2, 1],
    10: [12, 10, 8, 7, 6, 5, 4, 3, 2, 1],
    11: [12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
    12: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
    13: [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1],
}

In [8]:
# Add a "points" column to every event's results.
# NOTE(review): events[i] is looked up by position in DFS, so this assumes DFS
# holds exactly one frame per event, in page order - confirm, since pages that
# fail to parse are left out of DFS.
OUT = []
for i, results in enumerate(DFS):
    num = len(results)

    # Nothing to score when every entry was filtered out; lookup has no
    # entry for 0 and would raise KeyError.
    if num == 0:
        continue

    # Points by row position for a group of this size
    # (lookup only covers sizes 1-13; larger groups raise KeyError)
    scored = results.assign(points = lookup[num])

    # Handle ties: rows sharing a "Place" each get the mean of their points
    scored = scored.assign(points = scored.groupby("Place")["points"].transform("mean"))

    # Handle championship event edge case: 2 bonus points per entry
    event_name = events[i].text
    if "Championship" in event_name or "International" in event_name:
        scored["points"] = scored["points"] + 2

    OUT.append(scored)

In [9]:
# Stack all scored events into one table. Each event's frame carries its own
# row labels, so renumber the rows to avoid duplicate index labels in FULL.
FULL = pd.concat(OUT, ignore_index = True)

In [10]:
# Total points per college, highest total first
(
    FULL
    .groupby("College")["points"]
    .sum()
    .sort_values(ascending = False)
)

College
Univ. of CA Los Angeles     218.0
University of Denver        214.0
Univ. of CA Berkeley        193.0
Univ. of CA San Diego       176.0
Arizona State University    131.0
Stanford University         127.5
Univ. of CO Boulder         103.0
CO State University          79.5
Univ. of CO COSpgs           65.0
Utah State University        63.0
Colorado College             59.0
Univ. of CA Irvine           38.0
Western WA Univ.             27.0
Univ. of Northern CO         17.0
University of Wyoming        16.0
Name: points, dtype: float64