In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  # Restrict TensorFlow to only use the first GPU
  try:
    tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
  except RuntimeError as e:
    # Visible devices must be set at program startup
    print(e)




In [3]:
df = pd.read_csv('../data/combined_team_player_data.csv')

### Handle Text Data

In [4]:
df['positionCode'].unique()

array(['D', 'L', 'R', 'C'], dtype=object)

In [5]:
one_hot_encoded = pd.get_dummies(df['positionCode'], prefix='positionCode')

df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop('positionCode', axis=1)

one_hot_encoded = pd.get_dummies(df['homeRoad'], prefix='homeRoad')

df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop('homeRoad', axis=1)

one_hot_encoded = pd.get_dummies(df['homeRoad_opponent'], prefix='homeRoad_opponent')

df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop('homeRoad_opponent', axis=1)

### Reduce down to binary classes
We want to know if the player scores a powerplay point or not. It's quite rare for a player to score 2 or more (although it may be important information)

In [6]:
df['ppPoint_scored'] = df['ppPoints'].map(lambda x: 1 if x >= 1 else 0)
df.fillna(0, inplace=True)

### Feature Preparation

In [7]:
col_len = len(df.columns)
count = 0
for i in range(16):
    print(df.columns.to_list()[count:count+10])
    if count >= col_len:
        break
    count = count + 10


['gameId', 'goalsAgainst', 'goalsAgainstPerGame', 'goalsFor', 'goalsForPerGame', 'losses', 'otLosses', 'penaltyKillNetPct', 'penaltyKillPct', 'pointPct']
['powerPlayNetPct', 'powerPlayPct', 'regulationAndOtWins', 'shotsAgainstPerGame', 'shotsForPerGame', 'teamFullName', 'teamId', 'ties', 'wins', 'winsInRegulation']
['winsInShootout', 'powerPlayGoalsFor', 'ppGoalsPerGame', 'ppNetGoals', 'ppNetGoalsPerGame', 'ppOpportunities', 'ppOpportunitiesPerGame', 'shGoalsAgainst', 'shGoalsAgainstPerGame', 'benchMinorPenalties']
['gameMisconducts', 'majors', 'matchPenalties', 'minors', 'misconducts', 'netPenalties', 'netPenaltiesPer60', 'penalties', 'penaltiesDrawnPer60', 'penaltiesTakenPer60']
['penaltySecondsPerGame', 'totalPenaltiesDrawn', 'pkNetGoals', 'pkNetGoalsPerGame', 'pkTimeOnIcePerGame', 'pointsPct', 'ppGoalsAgainst', 'ppGoalsAgainstPerGame', 'shGoalsFor', 'shGoalsForPerGame']
['timesShorthanded', 'timesShorthandedPerGame', 'teamAbbrev', 'faceoffWinPct_opponent', 'gameDate_opponent', 'gam

In [8]:
to_remove = ['gameId', 'penaltyKillNetPct', 'powerPlayNetPct', 'regulationAndOtWins', 'teamFullName', 'teamId', 'ties', 'winsInShootout', 
             'ppOpportunitiesPerGame', 'shGoalsAgainstPerGame', 'netPenalties', 'netPenaltiesPer60', 'pkNetGoalsPerGame', 'opponentTeamAbbrev_opponent',
             'penaltyKillNetPct_opponent', 'regulationAndOtWins_opponent', 'teamFullName_opponent', 'teamId_opponent', 'ties_opponent', 'winsInShootout_opponent',
             'ppNetGoalsPerGame_opponent', 'ppOpportunitiesPerGame_opponent', 'shGoalsAgainstPerGame_opponent', 'netPenaltiesPer60_opponent',
             'netPenalties_opponent', 'pkNetGoalsPerGame_opponent', 'ppGoalsAgainstPerGame_opponent', 'shGoalsForPerGame_opponent', 'timesShorthandedPerGame_opponent',
             'teamAbbrev_opponent', 'lastName', 'playerId', 'pointsPerGame', 'shootsCatches', 'skaterFullName', 'gameDate', 'opponentTeamAbbrev', 
             'ppGoalsForPer60', 'ppIndividualSatForPer60', 'ppPointsPer60', 'ppPrimaryAssistsPer60', 'ppSecondaryAssistsPer60', 'ppShotsPer60', 'ppTimeOnIcePctPerGame',
             'ppTimeOnIcePerGame', 'goalsForPerGame', 'teamAbbrev', 'gameDate_opponent', 'gameDate', 'powerPlayGoalsFor', 'ppGoals'
            ]

print(len(to_remove))
all_cols = df.columns.to_list()

print(len(all_cols))
X_cols = list(set(all_cols) - set(to_remove))

X = df[X_cols]
print(len(X))
X.head()

51
162
426309


Unnamed: 0,goalsAgainst_opponent,ppShootingPct,shotsAgainstPerGame,shGoals,shotsForPerGame,powerPlayNetPct_opponent,goalsFor_opponent,wins_opponent,gamesPlayed_opponent,homeRoad_opponent_H,...,positionCode_R,shotsForPerGame_opponent,ppOpportunities,goalsAgainst,ppGoalsAgainst,losses_opponent,timesShorthanded,penaltyMinutes,penaltyKillPct,homeRoad_R
0,4,0.0,37.0,0,38.0,-0.25,3,0,1,True,...,False,37.0,7,3,0,1,4,2,1.0,True
1,4,0.333,37.0,0,38.0,-0.25,3,0,1,True,...,False,37.0,7,3,0,1,4,2,1.0,True
2,4,0.0,37.0,0,38.0,-0.25,3,0,1,True,...,False,37.0,7,3,0,1,4,0,1.0,True
3,4,0.0,37.0,0,38.0,-0.25,3,0,1,True,...,True,37.0,7,3,0,1,4,0,1.0,True
4,4,0.0,37.0,1,38.0,-0.25,3,0,1,True,...,False,37.0,7,3,0,1,4,0,1.0,True


### Handle Class Imbalance

In [9]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

target_variable = 'ppPoint_scored'

# Step 2: Inspect class distribution
print("Class distribution before SMOTE:")
print(df[target_variable].value_counts())

# Step 3: Split features and target variable
X = X.drop(target_variable, axis=1)
y = df[target_variable]

# Step 4: Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 5: Use the resampled dataset
# For example, you can split the resampled dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, shuffle=False)


ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\Logan\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
X_train.head()

### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
X_train_cols = X_train.columns.to_list()

In [None]:
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [None]:
callback = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
# Reshaping for LSTM
T = 1
X_train = X_train.reshape(X_train.shape[0], T, X_train.shape[1])
X_train.shape

In [None]:
dropout_rate = 0.2

model = Sequential([
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(dropout_rate),
    # LSTM(128, input_shape=(X_train.shape[0], 1, X_train.shape[2])),
    LSTM(128, input_shape=(X_train.shape[0], T, X_train.shape[2])),
    # LSTM(128, activation='tanh'),
    BatchNormalization(),
    Dropout(dropout_rate),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(dropout_rate),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Precision(), 'accuracy'])
history = model.fit(X_train, y_train, epochs=10, batch_size=128, verbose=1, callbacks=[callback], validation_data=(X_test, y_test))

In [None]:
model.save('model.keras')

In [None]:
model = tf.keras.models.load_model('model.keras')

In [None]:
from sklearn.metrics import classification_report

y_pred_probs = model.predict(X_test)

# Adjust the threshold
threshold = 0.9 # You can experiment with different threshold values
y_pred_labels = (y_pred_probs > threshold).astype(int)

print(f'threshold of {threshold*100}%')
print(classification_report(y_test, y_pred_labels))

y_pred = model.predict(X_test) 
y_pred = np.round(y_pred).astype(int)
print('Regular threshold')
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['precision'], label='Training Precision')
plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(y_pred, label='Testing Precision')
# plt.plot(history.history['val_precision'], label='Validation Precision')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Player List Prediction

#### Get Averages of Players and Teams, then combine into 1 DataFrame

In [None]:
player_df = pd.read_csv('../data/database/player-database.csv')
team_df = pd.read_csv('../data/database/team-database.csv')

### Get the average of each players stats for the last 5 games, then take the last occurance 

In [None]:

# Assuming you have loaded your dataframe into a variable called 'nhl_stats_df'

# Sort the dataframe by player and date
nhl_stats_df = player_df.sort_values(by=['playerId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group):
    return group.rolling(window=5, min_periods=1).mean()

# Split the dataframe into numeric and non-numeric parts
numeric_columns = nhl_stats_df.select_dtypes(include='number').columns
numeric_columns = numeric_columns.drop('gameId')
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Calculate the rolling average for numeric columns for each player
rolling_avg_numeric_df = nhl_stats_df.groupby('playerId', group_keys=False)[numeric_columns].apply(calc_rolling_avg)

# Merge rolling average with non-numeric columns
rolling_avg_df = pd.concat([nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df], axis=1)
player_df = rolling_avg_df.groupby('playerId').tail(1) ## get the last game for each player 
player_df.head()

### Get the average for each teams stats over the last 5 games, and return the last occurance

In [None]:

# Assuming you have loaded your dataframe into a variable called 'nhl_stats_df'

# Sort the dataframe by player and date
nhl_stats_df = team_df.sort_values(by=['teamId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group):
    return group.rolling(window=5, min_periods=1).mean()

# Split the dataframe into numeric and non-numeric parts
numeric_columns = nhl_stats_df.select_dtypes(include='number').columns
numeric_columns = numeric_columns.drop('gameId')
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Calculate the rolling average for numeric columns for each player
rolling_avg_numeric_df = nhl_stats_df.groupby('teamId', group_keys=False)[numeric_columns].apply(calc_rolling_avg)

# Merge rolling average with non-numeric columns
rolling_avg_df = pd.concat([nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df], axis=1)
team_df = rolling_avg_df.groupby('teamId').tail(1) ## Get the last occurance of each teams stats and save them into the orginal team_df
team_df.head()

### Group player stats, team stats, and opposing team stats into one dataframe for prediction

In [None]:
nhl_teams = {
    'Anaheim Ducks': 'ANA',
    'Arizona Coyotes': 'ARI',
    'Boston Bruins': 'BOS',
    'Buffalo Sabres': 'BUF',
    'Calgary Flames': 'CGY',
    'Carolina Hurricanes': 'CAR',
    'Chicago Blackhawks': 'CHI',
    'Colorado Avalanche': 'COL',
    'Columbus Blue Jackets': 'CBJ',
    'Dallas Stars': 'DAL',
    'Detroit Red Wings': 'DET',
    'Edmonton Oilers': 'EDM',
    'Florida Panthers': 'FLA',
    'Los Angeles Kings': 'LAK',
    'Minnesota Wild': 'MIN',
    'Montréal Canadiens': 'MTL',
    'Nashville Predators': 'NSH',
    'New Jersey Devils': 'NJD',
    'New York Islanders': 'NYI',
    'New York Rangers': 'NYR',
    'Ottawa Senators': 'OTT',
    'Philadelphia Flyers': 'PHI',
    'Pittsburgh Penguins': 'PIT',
    'San Jose Sharks': 'SJS',
    'Seattle Kraken': 'SEA',
    'St. Louis Blues': 'STL',
    'Tampa Bay Lightning': 'TBL',
    'Toronto Maple Leafs': 'TOR',
    'Vancouver Canucks': 'VAN',
    'Vegas Golden Knights': 'VGK',
    'Washington Capitals': 'WSH',
    'Winnipeg Jets': 'WPG'
}

team_df['teamAbbrev'] = team_df['teamFullName'].map(nhl_teams)
team_df.head()

team_df = pd.merge(team_df, team_df, left_on=['teamAbbrev', 'gameId'], right_on=['opponentTeamAbbrev', 'gameId'], suffixes=('', '_opponent'))

df = pd.merge(team_df, player_df, on=['teamAbbrev', 'gameId'])
df = df.loc[:, ~df.columns.str.endswith('_x')]
df = df.apply(lambda x: x.replace('_y', '') if x.name.endswith('_y') else x)

for i in df.columns:
    if i.endswith('_y'):
        df.rename(columns={i: i.replace('_y', '')}, inplace=True)

df = df.fillna(0)
df = df.drop_duplicates()
df.head()

In [None]:
df['skaterFullName']

### Load betable players for prediction & match them with averaged stats dataframe

In [None]:
bum_df = pd.read_csv('../../../lib/ai_bum_list.csv') 
bum_df.head()

In [None]:
pred_df = df[df['skaterFullName'].isin(bum_df['skaterFullName'])]
pred_df

### Prepare the pred_df for prediction

In [None]:
to_remove = ['gameId', 'penaltyKillNetPct', 'powerPlayNetPct', 'regulationAndOtWins', 'teamFullName', 'teamId', 'ties', 'winsInShootout', 
             'ppOpportunitiesPerGame', 'shGoalsAgainstPerGame', 'netPenalties', 'netPenaltiesPer60', 'pkNetGoalsPerGame', 'opponentTeamAbbrev_opponent',
             'penaltyKillNetPct_opponent', 'regulationAndOtWins_opponent', 'teamFullName_opponent', 'teamId_opponent', 'ties_opponent', 'winsInShootout_opponent',
             'ppNetGoalsPerGame_opponent', 'ppOpportunitiesPerGame_opponent', 'shGoalsAgainstPerGame_opponent', 'netPenaltiesPer60_opponent',
             'netPenalties_opponent', 'pkNetGoalsPerGame_opponent', 'ppGoalsAgainstPerGame_opponent', 'shGoalsForPerGame_opponent', 'timesShorthandedPerGame_opponent',
             'teamAbbrev_opponent', 'lastName', 'pointsPerGame', 'shootsCatches', 'skaterFullName', 'opponentTeamAbbrev', 
             'ppGoalsForPer60', 'ppIndividualSatForPer60', 'ppPointsPer60', 'ppPrimaryAssistsPer60', 'ppSecondaryAssistsPer60', 'ppShotsPer60', 'ppTimeOnIcePctPerGame',
             'ppTimeOnIcePerGame', 'goalsForPerGame', 'teamAbbrev', 'gameDate_opponent', 'powerPlayGoalsFor', 'ppGoals', 'gameDate', 'playerId'
            ]

all_cols = pred_df.columns.to_list()
X_cols = list(set(all_cols) - set(to_remove))
input_df = pred_df[X_cols]

In [None]:
one_hot_encoded = pd.get_dummies(pred_df['positionCode'], prefix='positionCode')

input_df = pd.concat([input_df, one_hot_encoded], axis=1)
input_df = pred_df.drop('positionCode', axis=1)

one_hot_encoded = pd.get_dummies(input_df['homeRoad'], prefix='homeRoad')

input_df = pd.concat([input_df, one_hot_encoded], axis=1)
input_df = pred_df.drop('homeRoad', axis=1)

one_hot_encoded = pd.get_dummies(input_df['homeRoad_opponent'], prefix='homeRoad_opponent')

input_df = pd.concat([input_df, one_hot_encoded], axis=1)
input_df = input_df.drop('homeRoad_opponent', axis=1)

input_df.head()

In [None]:
# Re-index to match the model input
input_df = input_df.reindex(X_train_cols, axis=1)
input_df

In [None]:
input_df = scaler.transform(input_df)

### Prediction

In [None]:
preds = model.predict(input_df)
preds

In [None]:
input_df = scaler.inverse_transform(input_df)
input_df = pd.DataFrame(input_df, columns=X_train_cols)
pred_df['predictions'] = preds

In [None]:
pred_df