In [14]:
#access the website to scrape the current season's games and data
import requests
from bs4 import BeautifulSoup
import pandas as pd

#wesbite with current season's games and stats
url = 'https://www.naturalstattrick.com/games.php?fromseason=20232024&thruseason=20232024&stype=2&sit=all&loc=B&team=All&rate=n'

response = requests.get(url)

#use beautiful soup to parse the webpage's content
soup = BeautifulSoup(response.content, 'html.parser')

# Find the table
table = soup.find('table')

# Read the table using pandas
df = pd.read_html(str(table))[0]
#drop all columns with a %
cols_to_drop = [col for col in df.columns if '%' in col]
    
#drop other unneccesary columns
df.drop(columns=cols_to_drop, inplace=True)
df.drop('Unnamed: 2', axis=1, inplace=True)
df.drop('Attendance', axis=1, inplace=True)
df.drop('TOI', axis=1, inplace=True)
df.to_csv('23_24_Games/23_24_games.csv', index=False)

df.head()


  df = pd.read_html(str(table))[0]


Unnamed: 0,Game,Team,CF,CA,FF,FA,SF,SA,GF,GA,...,MDSA,MDGF,MDGA,LDCF,LDCA,LDSF,LDSA,LDGF,LDGA,PDO
0,"2023-10-10 - Predators 3, Lightning 5",Nashville Predators,65,61,46,50,31,34,3,5,...,7,0,1,34,45,13,18,1,1,0.95
1,"2023-10-10 - Predators 3, Lightning 5",Tampa Bay Lightning,61,65,50,46,34,31,5,3,...,6,1,0,45,34,18,13,1,1,1.05
2,"2023-10-10 - Blackhawks 4, Penguins 2",Chicago Blackhawks,73,71,54,55,36,41,4,2,...,10,1,0,36,50,12,18,0,1,1.062
3,"2023-10-10 - Blackhawks 4, Penguins 2",Pittsburgh Penguins,71,73,55,54,41,36,2,4,...,10,0,1,50,36,18,12,1,0,0.938
4,"2023-10-10 - Kraken 1, Golden Knights 4",Seattle Kraken,71,61,48,42,33,28,1,4,...,6,0,1,33,32,13,9,0,0,0.887


In [15]:
import pandas as pd

# use the mapping dictionary to replace the team names with the abbreviations
abbreviations_df = pd.read_csv('teams.csv')
mapping_dict = dict(zip(abbreviations_df['Team'], abbreviations_df['Abbrv']))

def update_and_overwrite_file(file_path, mapping_dict):
    df = pd.read_csv(file_path)
    df['Team'] = df['Team'].map(mapping_dict)
    df.to_csv(file_path, index=False)  

file_paths = ['23_24_Games/23_24_games.csv']

for file_path in file_paths:
    update_and_overwrite_file(file_path, mapping_dict)

In [16]:
import pandas as pd
import re

# use the mapping dictionary to replace the team names in the game string with the abbreviations
abbreviations_df = pd.read_csv('teams.csv')
mapping_dict = dict(zip(abbreviations_df['Name'], abbreviations_df['Abbrv']))

def replace_team_names(game_string, mapping_dict):
    pattern = r'(\d{4}-\d{2}-\d{2}) - ([\w\s]+) (\d+), ([\w\s]+) (\d+)'
    match = re.match(pattern, game_string)

    if match:
        date, team1, score1, team2, score2 = match.groups()
        team1_abbr = mapping_dict.get(team1.strip(), team1)
        team2_abbr = mapping_dict.get(team2.strip(), team2) 
        return f"{date} - {team1_abbr} {score1}, {team2_abbr} {score2}"
    else:
        return game_string

def update_and_overwrite_file(file_path, mapping_dict):
    df = pd.read_csv(file_path)
    df['Game'] = df['Game'].apply(lambda x: replace_team_names(x, mapping_dict))
    df.to_csv(file_path, index=False)

file_paths = ['23_24_Games/23_24_games.csv']

for file_path in file_paths:
    update_and_overwrite_file(file_path, mapping_dict)

In [18]:
import os
import pandas as pd


input_directory = "C:/Users/luken/Desktop/NHL_Model/23_24_Games/"


base_output_directory = "C:/Users/luken/Desktop/NHL_Model/23_24_Team"
#iterate through the files and create new files for each team's games for each year
for file_name in os.listdir(input_directory):
    if file_name.endswith(".csv"):
        year = file_name.split('_games.csv')[0]

        year_directory = base_output_directory
        if not os.path.exists(year_directory):
            os.makedirs(year_directory)

        file_path = os.path.join(input_directory, file_name)
        df = pd.read_csv(file_path)
        pattern = r'(\d{4}-\d{2}-\d{2}) - ([\w\s]+) (\d+), ([\w\s]+) (\d+)'
        df[['Date', 'AwayTeam', 'AwayScore', 'HomeTeam', 'HomeScore']] = df['Game'].str.extract(pattern)
        df['HomeResult'] = 'Draw'
        df.loc[df['HomeScore'] > df['AwayScore'], 'HomeResult'] = 'Won'
        df.loc[df['HomeScore'] < df['AwayScore'], 'HomeResult'] = 'Lost'
        

#sort each team's games into a it's own file for the current season
        team_subsets = {team: df[df['Team'] == team] for team in df['Team'].unique()}

        for team, subset in team_subsets.items():
            team_file_name = f"{team}.csv"
            team_file_path = os.path.join(year_directory, team_file_name)
            subset.to_csv(team_file_path, index=False)


CSV files organized by year and saved in C:/Users/luken/Desktop/NHL_Model/23_24_Team


In [19]:
#calculate days of rest
dir_path = "C:/Users/luken/Desktop/NHL_Model/23_24_Team"


all_files = [f for f in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, f)) and f.endswith('.csv')]

for file in all_files:
    file_path = os.path.join(dir_path, file)
    df = pd.read_csv(file_path)
    if 'Date' in df.columns:
        date_col = 'Date'
    df[date_col] = pd.to_datetime(df[date_col])

    df['time_diff'] = df[date_col].diff()
    df['days_of_rest'] = df['time_diff'].dt.days - 1
    df.drop('time_diff', axis=1, inplace=True)
    df.to_csv(file_path, index=False)



Processed ANA.csv
Processed ARI.csv
Processed BOS.csv
Processed BUF.csv
Processed CAR.csv
Processed CBJ.csv
Processed CGY.csv
Processed CHI.csv
Processed COL.csv
Processed DAL.csv
Processed DET.csv
Processed EDM.csv
Processed FLA.csv
Processed LA.csv
Processed MIN.csv
Processed MTL.csv
Processed NJ.csv
Processed NSH.csv
Processed NYI.csv
Processed NYR.csv
Processed OTT.csv
Processed PHI.csv
Processed PIT.csv
Processed SEA.csv
Processed SJ.csv
Processed STL.csv
Processed TB.csv
Processed TOR.csv
Processed VAN.csv
Processed VGK.csv
Processed WPG.csv
Processed WSH.csv


In [7]:
#this cell is meant to be run to sort the current season's data and calculate the rolling averages to be able to put into the model and track the results so far this season
import os
import pandas as pd

def calculate_rolling_average(df, columns, window=10):
    rolling_df = df[columns].rolling(window=window, min_periods=10).mean().shift(1)
    return rolling_df
base_path = '23_24_Team'
average_data_path = '23_24_average_data/'
os.makedirs(average_data_path, exist_ok=True)

for team_file in os.listdir(base_path):
    if team_file.endswith('.csv'):
        team_path = os.path.join(base_path, team_file)
        df = pd.read_csv(team_path)
        selected_columns = ['Game', 'Team', 'Date', 'AwayTeam', 'AwayScore', 'HomeTeam', 'HomeScore', 'HomeResult', 'days_of_rest']
        other_columns = df.columns.difference(selected_columns)
        rolling_df = calculate_rolling_average(df, other_columns)

        combined_df = pd.concat([df[selected_columns], rolling_df], axis=1)
        cols_to_drop = ['Team','AwayTeam', 'AwayScore', 'HomeTeam', 'HomeScore', 'HomeResult']

        combined_df.drop(columns=cols_to_drop, inplace=True)
        processed_file_path = os.path.join(average_data_path, f'{team_file}')
        combined_df.to_csv(processed_file_path, index=False)