### Scrape starting goalies from Daily Faceoff

In [1]:
# Libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import datetime as dt

In [16]:
def process_goalie_DF_card(section_html, team_name_dict=None):
    """Extract the teams, goalie names and goalie statuses from one game card.

    Parameters
    ----------
    section_html : bs4 element (or any object exposing ``find``/``find_all``)
        Parsed HTML of a single game card.
    team_name_dict : dict, optional
        Mapping of lower-cased team name to its 3 letter code. When omitted,
        it is loaded from ``../../data/team_name_dictionary.txt`` on every
        call; pass it explicitly to avoid re-reading the file for each card.

    Returns
    -------
    tuple of (list, list, list)
        ``teams``  - the two team codes, in the order they appear around
        ``' at '`` in the card heading;
        ``names``  - the two goalie names exactly as found in the HTML;
        ``status_list`` - one status code per goalie: ``'U'`` (unconfirmed),
        ``'P'`` (projected / likely / expected) or ``'C'`` (confirmed).

    Raises
    ------
    AssertionError
        If the card does not contain exactly 2 teams, 2 names or 2 statuses.
    KeyError
        If a team name is missing from ``team_name_dict`` or a status text is
        not one of the known statuses.
    """
    # Read in team name to 3 letter code dictionary (only if not supplied)
    if team_name_dict is None:
        with open('../../data/team_name_dictionary.txt', 'r', encoding='utf-8') as f:
            team_name_dict = json.load(f)

    # Gather teams playing in this matchup
    teams_html = section_html.find(class_='text-center text-3xl text-white').text
    teams = teams_html.split(' at ')
    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if len(teams) != 2:
        raise AssertionError(f'Only 2 teams should be on a game card: {teams}')
    teams = [team_name_dict[team.strip().lower()] for team in teams]

    # Gather the 2 goalie names
    names_html = section_html.find_all(class_='text-center text-lg xl:text-2xl')
    names = [name.text for name in names_html]
    if len(names) != 2:
        raise AssertionError(f'Only 2 names should be on a game card: {names}')
    #########
    # Normally, clean the names here
    #########

    # What is the status of the goalies for the upcoming game?
    status_to_code_dict = {
        'unconfirmed': 'U',
        'projected': 'P',
        'likely': 'P',
        'expected': 'P',
        'confirmed': 'C',
    }

    status_html = section_html.find_all('div', {'class': ['flex flex-row items-center justify-center gap-1 xl:justify-end', 'flex flex-row items-center justify-center gap-1 xl:justify-start']})
    status_text = [status.text.lower().strip() for status in status_html]

    # Validate BEFORE translating: a plain dictionary lookup would raise a
    # bare KeyError and never reach a later check, hiding the useful message.
    if any(status not in status_to_code_dict for status in status_text):
        raise KeyError(f'Unknown status in status list: {status_text}')
    status_list = [status_to_code_dict[status] for status in status_text]

    # Each goalie needs exactly one status, otherwise the three returned
    # lists would be misaligned when combined across cards.
    if len(status_list) != len(names):
        raise AssertionError(f'Only 2 statuses should be on a game card: {status_list}')

    # TODO: a game_id (game date as %y%m%d + '-' + both team codes) was
    # planned here, but the game date is not available inside this function.

    return teams, names, status_list


In [17]:
def get_DF_goalies(date_of_games=None, today_flag=None):
    """Scrape the starting goalies listed on Daily Faceoff for one date.

    Parameters
    ----------
    date_of_games : str, optional
        Date of the games, appended verbatim to the starting-goalies URL
        (e.g. ``'2023-10-12'``). Ignored when ``today_flag`` is truthy.
    today_flag : bool, optional
        When truthy, today's date (local clock) is used as the game date.

    Returns
    -------
    pandas.DataFrame
        One row per goalie with the columns ``date_recorded``,
        ``time_recorded``, ``date_game``, ``team``, ``name`` and ``status``.

    Raises
    ------
    ValueError
        If neither a date nor a truthy ``today_flag`` is given.
    AssertionError
        If no game cards are found on the page for the requested date.
    """
    # Ensure at least 1 argument is specified. ``today_flag=False`` without a
    # date must be rejected as well, otherwise the URL cannot be built.
    if date_of_games is None and not today_flag:
        raise ValueError("At least one of 'date_of_games' or 'today_flag' must be specified.")

    # Current date and time
    dt_now = dt.datetime.now()
    date_recorded = dt_now.date()
    time_recorded = dt_now.strftime('%H:%M:%S')

    # If today's goalies are desired
    if today_flag:
        date_of_games = str(date_recorded)

    # URL for DF goalies
    url = 'https://www.dailyfaceoff.com/starting-goalies/' + date_of_games

    # Send a browser-like User-Agent with the request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

    # Gather HTML from website; the timeout keeps an unresponsive server from
    # blocking the call forever (requests has no timeout by default).
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')
    game_cards = soup.find_all('article', class_='w-full')

    # Raised explicitly (rather than via ``assert``) so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if not game_cards:
        raise AssertionError(f"No games were found on daily faceoff for {date_of_games}: {url}")

    # Process each game
    teams = []
    names = []
    status_list = []
    for card in game_cards:
        t, n, s = process_goalie_DF_card(card)
        teams.extend(t)
        names.extend(n)
        status_list.extend(s)

    # Assemble data frame (the scalar values are broadcast to every row)
    DF_goalies = pd.DataFrame({
        'date_recorded': date_recorded,
        'time_recorded': time_recorded,
        'date_game': date_of_games,
        'team': teams,
        'name': names,
        'status': status_list,
    })

    return DF_goalies

In [21]:
# Test functions
# Smoke test: request the starting goalies for a fixed date. This calls
# get_DF_goalies, which performs a live HTTP request to dailyfaceoff.com,
# so the result depends on what the site lists when the cell is run.
x = get_DF_goalies(date_of_games='2023-10-12')

In [22]:
x

Unnamed: 0,date_recorded,time_recorded,date_game,team,name,status
0,2023-10-03,11:43:21,2023-10-12,PHI,Carter Hart,U
1,2023-10-03,11:43:21,2023-10-12,CBJ,Elvis Merzlikins,U
2,2023-10-03,11:43:21,2023-10-12,NYR,Igor Shesterkin,U
3,2023-10-03,11:43:21,2023-10-12,BUF,Devon Levi,U
4,2023-10-03,11:43:21,2023-10-12,DET,Ville Husso,U
5,2023-10-03,11:43:21,2023-10-12,NJD,Vitek Vanecek,U
6,2023-10-03,11:43:21,2023-10-12,FLA,Sergei Bobrovsky,U
7,2023-10-03,11:43:21,2023-10-12,MIN,Filip Gustavsson,U
8,2023-10-03,11:43:21,2023-10-12,SEA,Philipp Grubauer,U
9,2023-10-03,11:43:21,2023-10-12,NSH,Juuse Saros,U
