## These odds are taken from the BetUS sportsbook archive for the 2021 and 2021-22 seasons
Note that this will not be as precise as being able to use DK's odds for all of these games. Hopefully, it is close enough to be effective in improving model training and validation.

In [136]:
# Libraries
import datetime as dt
import json
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [137]:
# Archive pages holding the two seasons of odds
betus_21 = 'https://www.sportsbookreviewsonline.com/scoresoddsarchives/nhl-odds-2021/'
betus_22 = 'https://www.sportsbookreviewsonline.com/scoresoddsarchives/nhl-odds-2021-22/'

# Browser-like User-Agent sent with every request. The original note suggests the
# site otherwise rejects scripted requests (not verified here).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

In [138]:
# Load the team-name -> 3-letter-code dictionary (JSON content) used for cleaning
with open('../data/team_name_dictionary.txt', 'r') as dict_file:
    team_name_dict = json.load(dict_file)

### 2021 Season

In [139]:
# Download the 2021 archive page.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails here on a 4xx/5xx reply instead of letting an
# error page flow into the HTML table parser.
response_21 = requests.get(betus_21, headers=headers, timeout=30)
response_21.raise_for_status()

In [140]:
# Take the first HTML table found in the response.
# The markup is wrapped in StringIO because passing a literal HTML string
# straight to pd.read_html is deprecated in recent pandas releases.
archived_odds_21 = pd.read_html(StringIO(response_21.text))[0]

In [141]:
# Keep only what is needed for moneyline odds (puckline and O/U could be added later).
# Column positions 0, 3 and 8 become date_game, team and ml_odds; the row labelled 0
# is then dropped (presumably the site's header row read in as data - confirm).
archived_ml_odds_21 = (
    archived_odds_21
    .iloc[:, [0, 3, 8]]
    .rename(columns={0: 'date_game', 3: 'team', 8: 'ml_odds'})
    .drop(0)
)

In [142]:
# Lower-case each team name, then swap it for its 3-letter code via team_name_dict
archived_ml_odds_21 = archived_ml_odds_21.assign(
    team=lambda odds: odds['team'].str.lower().replace(team_name_dict)
)

In [143]:
# Change date to standard format.
# The raw values are month+day digits without a leading zero (e.g. '113').
# zfill(4) pads three-digit values to MMDD exactly like the former fixed '0'
# prefix, but leaves an already four-digit value intact instead of turning it
# into an unparseable five-digit string. '21' is appended as the two-digit year.
archived_ml_odds_21['date_game'] = pd.to_datetime(
    archived_ml_odds_21['date_game'].astype(str).str.zfill(4) + '21',
    format='%m%d%y',
)

In [144]:
# Filter for only the regular season.
# Step 1: keep games dated on or before 2021-05-19.
archived_ml_odds_21 = archived_ml_odds_21[archived_ml_odds_21['date_game'] <= '2021-05-19']

# Step 2: remove the playoff games played inside that window. Each date maps to the
# teams whose rows on that date are playoff rows:
#   2021-05-15: BOS v WSH
#   2021-05-16: TBL v FLA, MIN v VGK, NYI v PIT
#   2021-05-17: NSH v CAR, BOS v WSH, STL v COL
#   2021-05-18: TBL v FLA, MIN v VGK, NYI v PIT
#   2021-05-19: BOS v WSH, NSH v CAR, STL v COL, WPG v EDM
playoff_teams_by_date_21 = {
    '2021-05-15': ['BOS', 'WSH'],
    '2021-05-16': ['TBL', 'FLA', 'MIN', 'VGK', 'NYI', 'PIT'],
    '2021-05-17': ['NSH', 'CAR', 'BOS', 'WSH', 'STL', 'COL'],
    '2021-05-18': ['TBL', 'FLA', 'MIN', 'VGK', 'NYI', 'PIT'],
    '2021-05-19': ['BOS', 'WSH', 'NSH', 'CAR', 'STL', 'COL', 'WPG', 'EDM'],
}

for playoff_date, playoff_teams in playoff_teams_by_date_21.items():
    on_playoff_date = archived_ml_odds_21['date_game'] == playoff_date
    is_playoff_team = archived_ml_odds_21['team'].isin(playoff_teams)
    archived_ml_odds_21 = archived_ml_odds_21[~(on_playoff_date & is_playoff_team)]

In [145]:
# Display the filtered 2021 odds; they are combined with 2022 and written to file later
archived_ml_odds_21

Unnamed: 0,date_game,team,ml_odds
1,2021-01-13,PIT,-110
2,2021-01-13,PHI,100
3,2021-01-13,MTL,132
4,2021-01-13,TOR,-145
5,2021-01-13,CHI,230
...,...,...,...
1734,2021-05-16,VAN,110
1747,2021-05-18,CGY,-135
1748,2021-05-18,VAN,115
1755,2021-05-19,VAN,145


### 2022 Season

In [146]:
# Download the 2021-22 archive page.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() fails here on a 4xx/5xx reply instead of letting an
# error page flow into the HTML table parser.
response_22 = requests.get(betus_22, headers=headers, timeout=30)
response_22.raise_for_status()

In [147]:
# Take the first HTML table found in the response.
# The markup is wrapped in StringIO because passing a literal HTML string
# straight to pd.read_html is deprecated in recent pandas releases.
archived_odds_22 = pd.read_html(StringIO(response_22.text))[0]

In [148]:
# Keep only what is needed for moneyline odds (puckline and O/U could be added later).
# Column positions 0, 3 and 8 become date_game, team and ml_odds; the row labelled 0
# is then dropped (presumably the site's header row read in as data - confirm).
archived_ml_odds_22 = (
    archived_odds_22
    .iloc[:, [0, 3, 8]]
    .rename(columns={0: 'date_game', 3: 'team', 8: 'ml_odds'})
    .drop(0)
)

In [149]:
# Lower-case each team name, then swap it for its 3-letter code via team_name_dict
archived_ml_odds_22 = archived_ml_odds_22.assign(
    team=lambda odds: odds['team'].str.lower().replace(team_name_dict)
)

In [150]:
# Change date to standard format
def correct_dt_format_season_22(date):
    """Turn a raw month+day value into an MMDDYY string for the 2021-22 season.

    Parameters
    ----------
    date : str or int
        Month and day digits with no leading zero, e.g. '112' or '1012'.
        The value is converted with str() and stripped first, so integer
        input and surrounding whitespace are accepted.

    Returns
    -------
    str
        '0' + date + '22' for three-digit input (single-digit month),
        date + '21' for four-digit input (two-digit month),
        or the sentinel string 'error' for any other length. The sentinel
        does not match the '%m%d%y' format, so a later pd.to_datetime call
        with that format will reject it.
    """
    # Coerce first: len() would raise TypeError on an int
    date = str(date).strip()
    if len(date) == 3:
        return '0' + date + '22'
    elif len(date) == 4:
        return date + '21'
    else:
        return 'error'
    
# Rewrite every raw date as MMDDYY text with the helper, then parse that text
# into datetime values and store them back in the date_game column
date_text_22 = archived_ml_odds_22['date_game'].apply(correct_dt_format_season_22)
archived_ml_odds_22['date_game'] = pd.to_datetime(date_text_22, format='%m%d%y')

In [152]:
# Regular season only: keep rows dated on or before 2022-05-01
in_regular_season_22 = archived_ml_odds_22['date_game'] <= '2022-05-01'
archived_ml_odds_22 = archived_ml_odds_22[in_regular_season_22]

In [153]:
# Display the filtered 2022 odds
archived_ml_odds_22

Unnamed: 0,date_game,team,ml_odds
1,2021-10-12,PIT,120
2,2021-10-12,TBL,-140
3,2021-10-12,SEA,235
4,2021-10-12,VGK,-290
5,2021-10-13,NYR,115
...,...,...,...
2620,2022-04-29,SEA,-120
2621,2022-04-29,NSH,-280
2622,2022-04-29,ARI,230
2623,2022-05-01,SEA,155


### Combine and write

In [155]:
# Combine the two seasons row-wise.
# Both inputs keep their own row labels (each starts at 1), so ignore_index=True is
# used to give the combined frame a fresh, unique 0..n-1 index instead of duplicates.
archived_ml_odds_comb = pd.concat(
    [archived_ml_odds_21, archived_ml_odds_22],
    axis=0,
    ignore_index=True,
)

In [None]:
# Make the odds column integer-typed. Values that cannot be parsed become missing
# (the original note says 2 odds are missing); the nullable Int64 dtype keeps the
# rest as integers alongside those missing entries.
ml_odds_numeric = pd.to_numeric(archived_ml_odds_comb['ml_odds'], errors='coerce')
archived_ml_odds_comb['ml_odds'] = ml_odds_numeric.astype('Int64')

In [156]:
# Write the combined odds to CSV, with a header row and without the index column
archived_ml_odds_comb.to_csv(
    '../data/odds/archived_betus_ml_odds.csv',
    header=True,
    index=False,
)