In [136]:
# Libraries
import pandas as pd
import numpy as np
import datetime as dt
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [137]:
# Snapshot of when this scrape ran: calendar date plus wall-clock time (HH:MM:SS)
dt_now = dt.datetime.now()
date_recorded = dt_now.date()
time_recorded = dt_now.strftime('%H:%M:%S')

# Games are collected for the same day the scrape is recorded
date_of_games = date_recorded

In [138]:
# Page to scrape: DraftKings NHL league page, filtered to the goalscorer category
example_url = (
    'https://sportsbook.draftkings.com/leagues/hockey/nhl?category=goalscorer'
)

In [139]:
#==========Enter into web driver scrape==========#
# Configure Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run without opening a browser window

# Create a WebDriver instance
driver = webdriver.Chrome(options=options)

try:
    # Open the URL
    driver.get(example_url)

    # Explicit wait: poll for up to 30 seconds until the game cards are in the DOM
    wait = WebDriverWait(driver, 30)

    # Get each expanded game card. Two classes must match at once (the wrapper
    # class AND "expanded"), which is a compound selector, so a CSS selector is
    # used instead of By.CLASS_NAME, which is meant for a single class name.
    game_cards = wait.until(
        EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, '.sportsbook-event-accordion__wrapper.expanded')
        )
    )
except BaseException:
    # The browser is normally closed at the end of the odds-scraping cell. If
    # loading fails (or the cell is interrupted) that cell is never reached, so
    # close the browser here instead of leaving a headless Chrome process
    # running, then let the original error surface.
    driver.quit()
    raise

# Cards whose date matches `date_of_games` are collected into this list later
game_cards_final = []

In [140]:
# Tokens that may appear in an explicit card date. Built once, outside the loop.
month_mapping = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12
}
weekday_tokens = {'MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'}
ordinal_suffixes = ('ST', 'ND', 'RD', 'TH')  # 1ST, 2ND, 3RD, 4TH, 21ST, ...

# Start from an empty selection so re-running this cell cannot add a card twice
game_cards_final = []

# For each card
for card in game_cards:
    # Get the date text shown on the card
    card_date = card.find_element(By.CLASS_NAME, 'sportsbook-event-accordion__date').text

    if 'TODAY' in card_date:
        card_date_final = date_recorded
    elif 'TOMORROW' in card_date:
        # ===NOT TESTED YET===
        card_date_final = date_recorded + dt.timedelta(days=1)
    else:
        # ===NOT TESTED YET===
        # Explicit date: classify each token by what it is (month abbreviation or
        # day number) rather than by its position, and ignore everything else
        # (day of week, time, unrecognised tokens).
        months_found = []
        days_found = []
        for part in card_date.split():
            if part.endswith(('AM', 'PM')) or part in weekday_tokens:
                continue
            if part in month_mapping:
                months_found.append(month_mapping[part])
            elif part[-2:] in ordinal_suffixes and part[:-2].isdigit():
                # Strip the ordinal suffix: "10TH" -> 10, "1ST" -> 1, "22ND" -> 22
                days_found.append(int(part[:-2]))
            elif part.isdigit():
                days_found.append(int(part))

        # Exactly one month and one day are required; anything else is ambiguous
        if len(months_found) != 1 or len(days_found) != 1:
            print(f'A date was processed incorrectly: {card_date!r}')
            continue

        try:
            card_date_final = dt.date(date_recorded.year, months_found[0], days_found[0])
            # NOTE(review): assumes listed games are never in the past, so a date
            # earlier than the scrape date belongs to next year (e.g. a January
            # game listed in December) -- confirm against live data.
            if card_date_final < date_recorded:
                card_date_final = card_date_final.replace(year=date_recorded.year + 1)
        except ValueError:
            # Day/month combination that does not exist (e.g. day 31 in a 30-day month)
            print(f'A date was processed incorrectly: {card_date!r}')
            continue

    # Keep only the cards whose game date is the target date
    if card_date_final == date_of_games:
        game_cards_final.append(card)

In [153]:
# Load the JSON lookup used to turn a scraped team name into its team code
with open('../../data/team_name_dictionary.txt') as team_dict_file:
    team_name_dict = json.load(team_dict_file)

In [156]:
# Per-player team columns; the odds-scraping cell extends these alongside names/odds
home_teams, away_teams = [], []

# Validation pass: every selected card must resolve to exactly two team codes
for card in game_cards_final:
    title_text = card.find_element(By.CLASS_NAME, 'sportsbook-event-accordion__title-wrapper').text
    # One entry per line of the title, normalised to match the dictionary keys
    teams = [line.strip().lower() for line in title_text.split('\n')]
    # Drop the "at" separator between the two team names (raises if it is missing)
    del teams[teams.index('at')]
    teams = [team_name_dict[team] for team in teams]
    assert len(teams) == 2, f"Incorrect number of teams found: {teams}\n"

['NYR', 'BUF']
['DET', 'NJD']
['PHI', 'CBJ']
['STL', 'DAL']
['SEA', 'NSH']
['FLA', 'MIN']
['VGK', 'SJS']


In [159]:
# Scratch check: repeat the first team code of the most recently parsed card four times
[teams[0] for _ in range(4)]

['VGK', 'VGK', 'VGK', 'VGK']

In [127]:
names = []
odds = []
# Reset the team lists here as well: they are extended in lock-step with
# names/odds below, so re-running this cell must not leave them longer.
home_teams = []
away_teams = []

try:
    for card in game_cards_final:
        # Find the index of the "anytime scorer" column among the header labels
        colnames = card.find_element(By.CLASS_NAME, 'scorer-7__header-wrapper').text
        colnames = colnames.split('\n')
        colnames = [col.strip().lower() for col in colnames]
        assert len(colnames) == 3, f"3 column names should've been located: {colnames}\n"
        assert 'anytime scorer' in colnames, f"'anytime scorer' column not found. {colnames}\n"
        anytime_index = colnames.index('anytime scorer')
        # Negative offset of that column counted from the END of each row's lines
        # (equals anytime_index - 3 given the assertion above).
        # NOTE(review): presumably counted from the end so the leading name line
        # does not shift the offset -- confirm against the page layout.
        anytime_end_index = anytime_index - len(colnames)

        # Locate teams on each card; the title reads "<team> at <team>"
        teams = card.find_element(By.CLASS_NAME, 'sportsbook-event-accordion__title-wrapper').text
        teams = teams.split('\n')
        teams = [team.strip().lower() for team in teams]
        teams.pop(teams.index('at'))
        teams = [team_name_dict[team] for team in teams]
        assert len(teams) == 2, f"Incorrect number of teams found: {teams}\n"

        # Locate names and odds together since if done separately, an ordering issue arises from the scrape
        names_and_odds = card.find_elements(By.CLASS_NAME, 'scorer-7__body')
        names_and_odds_split = [info.text.split('\n') for info in names_and_odds]

        # Generate list of names (first line of each row)
        names_single_game = [splits[0] for splits in names_and_odds_split]
        # PASS THROUGH CLEAN NAMES FUNCTION HERE

        # Generate list of odds; the page uses the Unicode minus sign, which int() rejects
        odds_single_game = [splits[anytime_end_index] for splits in names_and_odds_split]
        odds_single_game = [int(odd.replace("−", "-")) for odd in odds_single_game]

        # Append these values to running list
        assert len(names_single_game) == len(odds_single_game), f"Number of names should be the same as the number of odds:\nNum names:{len(names_single_game)}\nNum odds: {len(odds_single_game)}\n"
        names.extend(names_single_game)
        odds.extend(odds_single_game)
        # First team in the title is recorded as away, second as home
        away_teams.extend([teams[0]] * len(names_single_game))
        home_teams.extend([teams[1]] * len(names_single_game))
finally:
    # Always close the browser, even when an assertion or a parse error aborts
    # the loop; otherwise the headless Chrome process is left running.
    driver.quit()
#==========Exit web driver scrape==========#

In [133]:
# Assemble the scraped columns into one frame. Scalar values (the NaN player_id
# placeholder and the date/time stamps) are repeated on every row.
anytime_scorer = pd.DataFrame(
    dict(
        player_id=np.nan,
        date_recorded=date_recorded,
        time_recorded=time_recorded,
        date_game=date_of_games,
        name=names,
        odds=odds,
    )
)

In [134]:
# Keep every row except those whose name is exactly 'No Goalscorer'
anytime_scorer = anytime_scorer.loc[lambda frame: frame['name'] != 'No Goalscorer']

In [135]:
# Display the final frame (the rendered output below is truncated in the middle)
anytime_scorer

Unnamed: 0,player_id,date_recorded,time_recorded,date_game,name,odds
0,,2023-10-12,17:16:42,2023-10-12,Tage Thompson,120
1,,2023-10-12,17:16:42,2023-10-12,Mika Zibanejad,135
2,,2023-10-12,17:16:42,2023-10-12,Chris Kreider,145
3,,2023-10-12,17:16:42,2023-10-12,Alex Tuch,160
4,,2023-10-12,17:16:42,2023-10-12,Artemi Panarin,185
...,...,...,...,...,...,...
282,,2023-10-12,17:16:42,2023-10-12,Nicolas Hague,1400
283,,2023-10-12,17:16:42,2023-10-12,Jan Rutta,1500
284,,2023-10-12,17:16:42,2023-10-12,Brayden McNabb,2000
286,,2023-10-12,17:16:42,2023-10-12,Marc-Edouard Vlasic,2800
