## Scraping Collegiate Schedule from US Figure Skating Fanzone

#### Imports

In [1]:
import pandas as pd
import requests
import bs4

#### Global Variables

In [3]:
# Season to scrape, written the way it appears in the schedule URL path
season = "2022-23"

# Link to the US Figure Skating Fanzone collegiate schedule for `season`
url = f"https://usfigureskatingfanzone.com/sports/collegiate-skating/schedule/{season}"

#### Making the Request

In [4]:
# IMPORTANT: Requests for generic user agents must not be made within 30s of one another
# The timeout (in seconds) keeps this cell from hanging forever if the server
# never answers; raise_for_status() stops here on a 4xx/5xx reply instead of
# letting the next cell parse an error page
request = requests.get(url, timeout = 30)
request.raise_for_status()

In [8]:
# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one)
soup = bs4.BeautifulSoup(request.text, "html.parser")

In [29]:
def extract_urls(soup, unique = False):
    '''
    Collects the links to the individual results of each competition
    
    :param soup: A BeautifulSoup object from the US FS Fanzone url
    :param unique: If True, repeated urls are dropped and only the first
    occurrence of each is kept. Defaults to False, which returns one url
    per matching link (the page can link the same results more than once)
    :returns: A list of urls (the href of every <a> tag whose aria-label
    contains "INDIVIDUAL"), in the order the links are found
    '''
    
    # Helper to determine the correct results pages; the guard covers
    # links whose aria-label is missing or empty
    def _is_results(label):
        return bool(label) and "INDIVIDUAL" in label
    
    # Extract the individual results
    links = soup.find_all("a", attrs = {"aria-label": _is_results})
    
    urls = [link["href"] for link in links]
    
    if unique:
        # dict keys keep insertion order, so the first occurrence wins
        urls = list(dict.fromkeys(urls))
        
    return urls


In [30]:
# Show the results urls extracted from the schedule page
extract_urls(soup)

['https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32005/index.html',
 'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32005/index.html',
 'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32017/index.html',
 'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32017/index.html',
 'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32007/index.html',
 'https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32007/index.html',
 'https://usfigureskatingfanzone.com/documents/2022/11/22/Sunshine_Classic_2022_Individual_Results.pdf',
 'https://usfigureskatingfanzone.com/documents/2022/11/22/Sunshine_Classic_2022_Individual_Results.pdf']

In [19]:

# Inspect the raw <a> tags behind the urls above. The filter is written inline
# because the only helper with this logic (_is_results) is private to
# extract_urls, so a bare `is_results` name does not exist on a fresh kernel
soup.find_all("a", attrs = {"aria-label": lambda label: bool(label) and "INDIVIDUAL" in label})

[<a aria-label="INDIVIDUAL RESULTS for Collegiate Skating vs Midwest: Case Western Reserve University on October 29, 2022 at " href="https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32005/index.html">INDIVIDUAL RESULTS</a>,
 <a aria-label="INDIVIDUAL RESULTS for Collegiate Skating vs Midwest: Case Western Reserve University on October 29, 2022 at " href="https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32005/index.html" target="_blank">INDIVIDUAL RESULTS</a>,
 <a aria-label="INDIVIDUAL RESULTS for Collegiate Skating vs Northeast: New York University on November 5, 2022 at " href="https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32017/index.html">INDIVIDUAL RESULTS</a>,
 <a aria-label="INDIVIDUAL RESULTS for Collegiate Skating vs Northeast: New York University on November 5, 2022 at " href="https://ijs.usfigureskating.org/leaderboard/nonqual_results/2022/32017/index.html" target="_blank">INDIVIDUAL RESULTS</a>,
 <a aria-label="INDIVIDUAL R