In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

# Limit TensorFlow to the first GPU it can see. Device visibility has to be
# configured before TensorFlow initialises its devices; if that has already
# happened a RuntimeError is raised, which is reported instead of aborting.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    tf.config.set_visible_devices(gpus[0], 'GPU')
  except RuntimeError as err:
    print(err)

# Echo the detected GPUs as the cell output
gpus

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Raw team and player tables; the cells below rely on their `gameId`,
# `gameDate`, `teamId` / `playerId` columns.
team_df = pd.read_csv('../data/team_data.csv')
player_df = pd.read_csv('../data/player_data.csv')

In [None]:
# Binary target: 1 when the row's ppPoints is at least 1, otherwise 0 (missing values count as 0)
player_df['ppPoint_scored'] = (player_df['ppPoints'] >= 1).astype('int64')

In [None]:
# df = df[:len(df)//2]

### Calculate the rolling average of each player's stats over their last 10 games

In [None]:
# Sort the dataframe by player and date so that each player's rows are in
# chronological order for the rolling window computed below
# (assumes gameDate sorts chronologically, e.g. ISO-formatted strings — TODO confirm)
nhl_stats_df = player_df.sort_values(by=['playerId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group, window=10, min_periods=1):
    """Return the trailing rolling mean of every column in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single entity (e.g. one player), already in chronological order.
    window : int, default 10
        Number of most recent rows averaged for each output row.
    min_periods : int, default 1
        Minimum number of rows required to emit a value; with 1, the first
        rows are averaged over however many rows exist so far.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``group``, holding the rolling means.

    Notes
    -----
    With pandas' default window closure the current row is part of its own
    window, so each value includes that row's own statistics.
    """
    return group.rolling(window=window, min_periods=min_periods).mean()

# Columns to average: every numeric column except the game identifier and the
# target (the label has to stay a per-row 0/1 value rather than a mean)
numeric_columns = (
    nhl_stats_df.select_dtypes(include='number')
    .columns
    .drop(['gameId', 'ppPoint_scored'])
)
# NOTE(review): if playerId is stored as a number it is averaged as well
# (mean of a constant, returned as float) — confirm this is intended.

# Everything that is not averaged is carried over unchanged
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Rolling mean per player; rows keep their original index labels
# NOTE(review): the window contains the current row, so a row's averages
# already include that row's own stats — confirm this is intended for features.
rolling_avg_numeric_df = (
    nhl_stats_df
    .groupby('playerId', group_keys=False)[numeric_columns]
    .apply(calc_rolling_avg)
)

# Put the untouched columns and the averaged columns back side by side (aligned on the index)
player_rolling_avg = pd.concat(
    [nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df],
    axis=1,
)

# Re-sort chronologically. This rebinds player_df, so running the cell a
# second time would average the already-averaged values.
player_df = player_rolling_avg.sort_values('gameDate')
player_df

### Calculate the rolling average of each team's stats over their last 10 games

In [None]:
# Sort the dataframe by team and date so that each team's rows are in
# chronological order for the rolling window computed below
# (assumes gameDate sorts chronologically, e.g. ISO-formatted strings — TODO confirm)
nhl_stats_df = team_df.sort_values(by=['teamId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group, window=10, min_periods=1):
    """Return the trailing rolling mean of every column in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single entity (e.g. one team), already in chronological order.
    window : int, default 10
        Number of most recent rows averaged for each output row.
    min_periods : int, default 1
        Minimum number of rows required to emit a value; with 1, the first
        rows are averaged over however many rows exist so far.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``group``, holding the rolling means.

    Notes
    -----
    With pandas' default window closure the current row is part of its own
    window, so each value includes that row's own statistics.
    """
    return group.rolling(window=window, min_periods=min_periods).mean()

# Columns to average: every numeric column except the game identifier
numeric_columns = (
    nhl_stats_df.select_dtypes(include='number')
    .columns
    .drop('gameId')
)
# NOTE(review): teamId is presumably numeric and is therefore averaged too
# (mean of a constant, returned as float) — confirm this is intended.

# Everything that is not averaged is carried over unchanged
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Rolling mean per team; rows keep their original index labels
rolling_avg_numeric_df = (
    nhl_stats_df
    .groupby('teamId', group_keys=False)[numeric_columns]
    .apply(calc_rolling_avg)
)

# Put the untouched columns and the averaged columns back side by side (aligned on the index)
rolling_avg_df = pd.concat(
    [nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df],
    axis=1,
)

# Re-sort chronologically. This rebinds team_df, so running the cell a
# second time would average the already-averaged values.
team_df = rolling_avg_df.sort_values('gameDate')
team_df

### Combine Player and Team into one DataFrame (see data_preproc for details)

In [None]:
# Lookup from the full team name (the `teamFullName` column of team_df) to the
# three-letter abbreviation used as a join key with player_df.
# NOTE(review): names absent from this table become NaN when applied with
# `.map`; the cell below mentions 'PHX' rows, so check whether the data also
# contains a full team name that has no entry here.
nhl_teams = {
        'Anaheim Ducks': 'ANA',
        'Arizona Coyotes': 'ARI',
        'Boston Bruins': 'BOS',
        'Buffalo Sabres': 'BUF',
        'Calgary Flames': 'CGY',
        'Carolina Hurricanes': 'CAR',
        'Chicago Blackhawks': 'CHI',
        'Colorado Avalanche': 'COL',
        'Columbus Blue Jackets': 'CBJ',
        'Dallas Stars': 'DAL',
        'Detroit Red Wings': 'DET',
        'Edmonton Oilers': 'EDM',
        'Florida Panthers': 'FLA',
        'Los Angeles Kings': 'LAK',
        'Minnesota Wild': 'MIN',
        'Montréal Canadiens': 'MTL',
        'Nashville Predators': 'NSH',
        'New Jersey Devils': 'NJD',
        'New York Islanders': 'NYI',
        'New York Rangers': 'NYR',
        'Ottawa Senators': 'OTT',
        'Philadelphia Flyers': 'PHI',
        'Pittsburgh Penguins': 'PIT',
        'San Jose Sharks': 'SJS',
        'Seattle Kraken': 'SEA',
        'St. Louis Blues': 'STL',
        'Tampa Bay Lightning': 'TBL',
        'Toronto Maple Leafs': 'TOR',
        'Vancouver Canucks': 'VAN',
        'Vegas Golden Knights': 'VGK',
        'Washington Capitals': 'WSH',
        'Winnipeg Jets': 'WPG'
}

# Attach the three-letter abbreviation to every team row (names missing from
# `nhl_teams` come back as NaN from `.map`).
team_df['teamAbbrev'] = team_df['teamFullName'].map(nhl_teams)

# Pair every team row with the row of its opponent in the same game: the right
# side is matched through its `opponentTeamAbbrev`, and its columns get the
# `_opponent` suffix. This rebinds team_df, so the cell is not re-runnable
# without reloading the data.
team_df = pd.merge(
    team_df,
    team_df,
    left_on=['teamAbbrev', 'gameId'],
    right_on=['opponentTeamAbbrev', 'gameId'],
    suffixes=('', '_opponent'),
)

# One row per (team-game, player); team rows without any player are kept (left join)
df = pd.merge(team_df, player_df, on=['teamAbbrev', 'gameId'], how='left')

# Columns present in both frames arrive as `<name>_x` (team side) and
# `<name>_y` (player side): drop the team copy and give the player copy the
# plain name again. Only the trailing suffix is stripped.
df = df.loc[:, ~df.columns.str.endswith('_x')]
df = df.rename(columns=lambda col: col[:-2] if col.endswith('_y') else col)

df = df.fillna(0)
df = df.drop_duplicates()

# PHX was used in about 1,400 rows; Update to combine into ARI
# NOTE(review): this happens after both merges, so rows keyed on 'PHX' could
# not have matched 'ARI' rows during the joins — confirm whether the
# normalisation should be applied to the inputs instead.
df['teamAbbrev'] = df['teamAbbrev'].map(lambda x: 'ARI' if x == 'PHX' else x)
df['opponentTeamAbbrev'] = df['opponentTeamAbbrev'].map(lambda x: 'ARI' if x == 'PHX' else x)

# There are about 94,000 duplicates that don't get detected; remove them
df = df.drop_duplicates(subset=['playerId', 'gameId'], keep='first')
# run once
# df.to_csv('../data/combined-dataset.csv', index=False)

In [None]:
df

In [None]:
# One-hot encode the three categorical columns in a single pass: each source
# column is removed and its indicator columns (prefixed with the column name)
# are appended after the remaining columns, in the order listed here.
# NOTE: none of these indicator columns appear in the `to_keep` list applied
# in the feature-preparation cell, so they are discarded there.
df = pd.get_dummies(df, columns=['positionCode', 'homeRoad', 'homeRoad_opponent'])

In [None]:
df.shape

### Reduce down to binary classes
We want to know if the player scores a powerplay point or not. It's quite rare for a player to score 2 or more (although it may be important information)

In [None]:
# The target column is created on player_df before the rolling averages are
# taken, so the line below stays disabled:
# df['ppPoint_scored'] = df['ppPoints'].map(lambda x: 1 if x >= 1 else 0)

# Replace any remaining missing values with 0
df = df.fillna(0)

### Feature Preparation

In [None]:
# col_len = len(df.columns)
# count = 0
# for i in range(16):
#     print(df.columns.to_list()[count:count+10])
#     if count >= col_len:
#         break
#     count = count + 10


In [None]:
# Earlier exclusion list. No live code in this cell reads it, and several of
# its entries (e.g. 'ppOpportunitiesPerGame', 'powerPlayGoalsFor',
# 'ppTimeOnIcePerGame', 'netPenaltiesPer60_opponent') also appear in `to_keep`.
to_remove = ['gameId', 'penaltyKillNetPct', 'powerPlayNetPct', 'regulationAndOtWins', 'teamFullName', 'teamId', 'ties', 'winsInShootout', 
             'ppOpportunitiesPerGame', 'shGoalsAgainstPerGame', 'netPenalties', 'netPenaltiesPer60', 'pkNetGoalsPerGame', 'opponentTeamAbbrev_opponent',
             'penaltyKillNetPct_opponent', 'regulationAndOtWins_opponent', 'teamFullName_opponent', 'teamId_opponent', 'ties_opponent', 'winsInShootout_opponent',
             'ppNetGoalsPerGame_opponent', 'ppOpportunitiesPerGame_opponent', 'shGoalsAgainstPerGame_opponent', 'netPenaltiesPer60_opponent',
             'netPenalties_opponent', 'pkNetGoalsPerGame_opponent', 'ppGoalsAgainstPerGame_opponent', 'shGoalsForPerGame_opponent', 'timesShorthandedPerGame_opponent',
             'teamAbbrev_opponent', 'lastName', 'playerId', 'pointsPerGame', 'shootsCatches', 'skaterFullName', 'gameDate', 'opponentTeamAbbrev', 
             'ppGoalsForPer60', 'ppIndividualSatForPer60', 'ppPointsPer60', 'ppPrimaryAssistsPer60', 'ppSecondaryAssistsPer60', 'ppShotsPer60', 'ppTimeOnIcePctPerGame',
             'ppTimeOnIcePerGame', 'goalsForPerGame', 'teamAbbrev', 'gameDate_opponent', 'gameDate', 'powerPlayGoalsFor', 'ppGoals'
            ]

# Columns that make up the modelling frame, in the order the model will see
# them; the last entry is the target.
# NOTE(review): 'ppPoints' is the column the target was derived from.
to_keep = [
    'penaltyKillPct',
    'powerPlayPct',
    'powerPlayGoalsFor',
    'ppGoalsPerGame',
    'ppOpportunitiesPerGame',
    'penaltyKillPct_opponent',
    'netPenaltiesPer60_opponent',
    'assists',
    'evPoints',
    'gamesPlayed',
    'ppPoints',
    'ppTimeOnIcePerGame',
    'ppPoint_scored',
]

# Reduce the frame to exactly those columns and display it
df = df.loc[:, to_keep]
df

In [None]:
# Remember the selected column order. This binds the same list object as
# `to_keep` (no copy), so it still contains the target 'ppPoint_scored'.
X_train_cols = to_keep

### Handle Class Imbalance

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Name of the binary label column
target_variable = 'ppPoint_scored'

# Show how unbalanced the two classes are before oversampling
print("Class distribution before SMOTE:")
print(df[target_variable].value_counts())

# Separate the feature columns from the label
X = df.drop(target_variable, axis=1)
y = df[target_variable]

# Oversample the data with SMOTE (fixed seed for repeatability)
# NOTE(review): this runs on the full data set, before the train/test split
# performed later in the notebook, so resampled rows can end up in the test
# split as well. The resampled rows are then also used to build the
# consecutive-row windows for the LSTM — confirm both are intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Optional split of the resampled data, currently disabled (the split happens
# after the LSTM reshaping instead):
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, shuffle=False)

# Rebuild `df` from the resampled features and label (this replaces the
# original, un-resampled frame)
df = pd.DataFrame(X_resampled, columns=X.columns)
df[target_variable] = y_resampled

### Remove the first 100,000 rows (for speedy processing) — currently disabled; the cell below only casts the data to float32

In [None]:
# Row skipping is disabled; these two lines are kept for reference only
# skip_size = 200_000
# df = X[skip_size:].astype('float32')

# Cast every column to float32, then drop the feature frame left over from
# the oversampling cell (`X` is rebuilt as a list in the LSTM reshaping cell)
df = df.astype('float32')
del X

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df['ppPoint_scored'].value_counts()

In [None]:
df.loc[df['ppPoint_scored'] == 1]

In [None]:
sample = df.iloc[0]
sample

### LSTM Reshaping

In [None]:
# Number of consecutive rows that form one input sequence
T = 10

### Reshaping The Data for LSTM
# Window t holds the feature columns of rows t .. t+T-1 and is labelled with
# the target of the row that follows it (row t+T).
# NOTE(review): windows are cut from consecutive rows of `df` as it stands,
# without grouping by player — confirm that the row order is meaningful.

# Pull the values out of pandas once; slicing NumPy arrays per window avoids
# repeating iloc/drop on the DataFrame for every window.
feature_matrix = df.drop(columns='ppPoint_scored').to_numpy()
label_vector = df['ppPoint_scored'].to_numpy()
n_windows = max(len(df) - T, 0)

X = np.array([feature_matrix[t:t + T] for t in range(n_windows)])
y = label_vector[T:T + n_windows].copy()

N = len(X)
print("X.shape", X.shape, "y.shape", y.shape)

### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LSTM
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

### Scaling
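Flatten the 3-D LSTM windows (samples, timesteps, features) to 2-D, fit the scaler on X_train and apply it to X_train and X_test, then reshape both back to the 3-D LSTM shape for training.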

In [None]:
# Hold out 30% of the windows for testing (shuffled, fixed seed)
# NOTE(review): the windows were built with a stride of one row, so
# neighbouring windows share most of their rows; a shuffled split can place
# near-identical windows on both sides — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2, shuffle=True)

In [None]:
import pickle as pkl

# Remember the 3-D shapes so the arrays can be restored after scaling
train_batch, train_seq, train_features = X_train.shape
test_batch, test_seq, test_features = X_test.shape

# Flatten (windows, timesteps, features) -> (windows * timesteps, features):
# the scaler works on 2-D input with one column per feature
X_train = X_train.reshape(train_batch * train_seq, train_features)
X_test = X_test.reshape(test_batch * test_seq, test_features)

# Fit on the training rows only, then apply the same statistics to the test rows
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape back into LSTM
X_train = X_train.reshape(train_batch, train_seq, train_features)
X_test = X_test.reshape(test_batch, test_seq, test_features)

# Persist the fitted scaler for reuse at prediction time; the context manager
# makes sure the file is flushed and closed
with open('../models/scaler.pkl', 'wb') as scaler_file:
    pkl.dump(scaler, scaler_file)

# NOTE: X_train / X_test are overwritten in place, so running this cell twice
# would scale already-scaled data.
del train_batch, train_seq, train_features, test_batch, test_seq, test_features

In [None]:
# X_train_cols = X_train.columns.to_list()

In [None]:
# Stop training once the validation loss has not improved for 3 epochs in a row.
# NOTE(review): the fit call in this notebook trains for epochs=3, so this
# patience is presumably never exhausted — confirm the intended values.
callback = EarlyStopping(monitor='val_loss', patience=3)

In [None]:
X_test.shape

In [None]:
# X_train, X_test, y_train, y_test = X_train.astype('float32'), X_test.astype('float32'), y_train.astype('float32'), y_test.astype('float32')

In [None]:
# Only referenced by the disabled Dropout layer below
dropout_rate = 0.2

# Three stacked LSTM layers followed by a single sigmoid unit for the binary
# target. Only the first layer needs the input shape (timesteps, features);
# the following layers infer theirs from the previous layer's output.
model = Sequential([
    # Dense(128, activation='relu'),
    LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(128, return_sequences=True),
    LSTM(128),
    # Dense(128, activation='relu'),
    # Dropout(dropout_rate),
    Dense(1, activation='sigmoid')
])

# The precision metric is named explicitly: an unnamed metric gets an
# auto-generated, possibly suffixed name (e.g. 'precision_1') when the cell is
# re-run in the same kernel, which would change the keys in `history.history`.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[tf.keras.metrics.Precision(name='precision'), 'accuracy'],
)

# NOTE(review): the test split doubles as validation data here and is also
# used for the final report — confirm whether a separate validation split is wanted.
history = model.fit(X_train, y_train, epochs=3, batch_size=128, verbose=1, callbacks=[callback], validation_data=(X_test, y_test))

In [None]:
# Save the trained model to the current working directory
# (note: the scaler is written to '../models/' instead)
model.save('model.keras')

In [None]:
# Reload the saved model from disk; this replaces the in-memory `model`
model = tf.keras.models.load_model('model.keras')

### Model Testing

In [None]:
from sklearn.metrics import classification_report

# Threshold experiment kept for reference (disabled):
# y_pred_probs = model.predict(X_test)
# threshold = 0.9 # You can experiment with different threshold values
# y_pred_labels = (y_pred_probs > threshold).astype(int)
# print(f'threshold of {threshold*100}%')
# print(classification_report(y_test, y_pred_labels))

# Round the sigmoid outputs to 0/1 labels and report per-class metrics.
# X_test is the same split that was passed as validation data during training.
y_pred = np.round(model.predict(X_test)).astype(int)
print('Regular threshold')
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt

# Keras may store the precision metric under 'precision' or under a suffixed
# name such as 'precision_1' (when the model cell was run more than once), so
# look the key up instead of hard-coding it. Validation keys start with
# 'val_' and are therefore not matched.
precision_key = next(key for key in history.history if key.startswith('precision'))

plt.plot(history.history[precision_key], label='Training Precision')
plt.plot(history.history['accuracy'], label='Training Accuracy')
# plt.plot(history.history['val_' + precision_key], label='Validation Precision')
plt.xlabel('Epoch')
# Both curves share the axis, so label it generically
plt.ylabel('Score')
plt.title('Training precision and accuracy per epoch')
plt.legend()
plt.show()

### Player List Prediction

In [3]:
import pandas as pd
import numpy as np

#### Get Averages of Players and Teams, then combine into 1 DataFrame

In [4]:
# Team and player tables used for prediction. These rebind the `team_df` /
# `player_df` names that held the training data earlier in the notebook.
team_df = pd.read_csv('../data/database/team-database.csv')
player_df = pd.read_csv('../data/database/player-database.csv')

In [5]:
# Lookup from the full team name (the `teamFullName` column of team_df) to its
# three-letter abbreviation. Same table as the one defined in the training
# section of this notebook.
nhl_teams = {
    'Anaheim Ducks': 'ANA',
    'Arizona Coyotes': 'ARI',
    'Boston Bruins': 'BOS',
    'Buffalo Sabres': 'BUF',
    'Calgary Flames': 'CGY',
    'Carolina Hurricanes': 'CAR',
    'Chicago Blackhawks': 'CHI',
    'Colorado Avalanche': 'COL',
    'Columbus Blue Jackets': 'CBJ',
    'Dallas Stars': 'DAL',
    'Detroit Red Wings': 'DET',
    'Edmonton Oilers': 'EDM',
    'Florida Panthers': 'FLA',
    'Los Angeles Kings': 'LAK',
    'Minnesota Wild': 'MIN',
    'Montréal Canadiens': 'MTL',
    'Nashville Predators': 'NSH',
    'New Jersey Devils': 'NJD',
    'New York Islanders': 'NYI',
    'New York Rangers': 'NYR',
    'Ottawa Senators': 'OTT',
    'Philadelphia Flyers': 'PHI',
    'Pittsburgh Penguins': 'PIT',
    'San Jose Sharks': 'SJS',
    'Seattle Kraken': 'SEA',
    'St. Louis Blues': 'STL',
    'Tampa Bay Lightning': 'TBL',
    'Toronto Maple Leafs': 'TOR',
    'Vancouver Canucks': 'VAN',
    'Vegas Golden Knights': 'VGK',
    'Washington Capitals': 'WSH',
    'Winnipeg Jets': 'WPG'
}

# Add the abbreviation column; names missing from the table map to NaN
team_df['teamAbbrev'] = team_df['teamFullName'].map(nhl_teams)

In [6]:
# Distinct abbreviations on the team side (og) and on the opponent side (op)
og = team_df['teamAbbrev'].unique()
op = team_df['opponentTeamAbbrev'].unique()
# Number of distinct opponents
len(op)

32

In [7]:
team_df[team_df['opponentTeamAbbrev'].isin(og)]

Unnamed: 0,faceoffWinPct,gameDate,gameId,gamesPlayed,goalsAgainst,goalsAgainstPerGame,goalsFor,goalsForPerGame,homeRoad,losses,...,pkNetGoalsPerGame,pkTimeOnIcePerGame,pointsPct,ppGoalsAgainst,ppGoalsAgainstPerGame,shGoalsFor,shGoalsForPerGame,timesShorthanded,timesShorthandedPerGame,teamAbbrev
0,0.677966,2023-10-10,2023020002,1,4,4.0,2,2.0,H,1,...,0.0,480.0,0.0,0,0.0,0,0.0,4,4.0,PIT
1,0.322033,2023-10-10,2023020002,1,2,2.0,4,4.0,R,0,...,0.0,240.0,1.0,0,0.0,0,0.0,2,2.0,CHI
2,0.591836,2023-10-10,2023020003,1,1,1.0,4,4.0,H,0,...,0.0,660.0,1.0,0,0.0,0,0.0,4,4.0,VGK
3,0.408163,2023-10-10,2023020003,1,4,4.0,1,1.0,R,1,...,0.0,480.0,0.0,0,0.0,0,0.0,4,4.0,SEA
4,0.566666,2023-10-10,2023020001,1,3,3.0,5,5.0,H,0,...,-1.0,462.0,1.0,1,1.0,0,0.0,4,4.0,TBL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2619,0.418604,2024-04-18,2023021309,1,1,1.0,5,5.0,H,0,...,-1.0,214.0,1.0,1,1.0,0,0.0,2,2.0,CGY
2620,0.327272,2024-04-18,2023021310,1,1,1.0,5,5.0,H,0,...,-1.0,219.0,1.0,1,1.0,0,0.0,2,2.0,COL
2621,0.588235,2024-04-18,2023021307,1,3,3.0,4,4.0,R,0,...,0.0,373.0,1.0,2,2.0,2,2.0,4,4.0,SEA
2622,0.638297,2024-04-18,2023021312,1,4,4.0,5,5.0,H,0,...,-1.0,92.0,1.0,1,1.0,0,0.0,1,1.0,LAK


In [8]:
player_df

Unnamed: 0,assists,evGoals,evPoints,faceoffWinPct,gameWinningGoals,gamesPlayed,goals,lastName,otGoals,penaltyMinutes,...,ppPrimaryAssistsPer60,ppSecondaryAssists,ppSecondaryAssistsPer60,ppShootingPct,ppShots,ppShotsPer60,ppTimeOnIce,ppTimeOnIcePctPerGame,ppTimeOnIcePerGame,teamAbbrev
0,3,0,1,0.00000,0,1,0,Point,0,0,...,8.823,1,8.823,0.000,1,8.823,408,0.822,408.0,TBL
1,0,2,2,,0,1,2,Kucherov,0,2,...,,0,,0.000,2,17.690,407,0.820,407.0,TBL
2,1,0,0,0.70588,0,1,0,Stamkos,0,2,...,0.000,1,8.955,0.000,2,17.910,402,0.810,402.0,TBL
3,1,0,1,0.27272,1,1,2,Paul,0,0,...,0.000,0,0.000,0.666,3,27.692,390,0.786,390.0,TBL
4,1,0,1,,0,1,0,Hedman,0,0,...,0.000,0,0.000,0.000,1,9.326,386,0.778,386.0,TBL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47216,0,0,0,,0,1,0,Zellweger,0,0,...,0.000,0,0.000,0.000,0,0.000,0,0.000,0.0,ANA
47217,1,0,1,0.00000,0,1,0,Gauthier,0,0,...,0.000,0,0.000,0.000,0,0.000,0,0.000,0.0,ANA
47218,0,0,0,,0,1,0,Ohgren,0,0,...,0.000,0,0.000,0.000,0,0.000,0,0.000,0.0,MIN
47219,0,1,1,,0,1,1,Klapka,0,5,...,0.000,0,0.000,0.000,0,0.000,0,0.000,0.0,CGY


### Get the average of each player's stats over their last 5 games, then take the last occurrence

In [9]:

# `nhl_stats_df` is a working copy of player_df used by the rest of this cell

# Sort the dataframe by player and date so that each player's rows are in
# chronological order for the rolling window computed below
nhl_stats_df = player_df.sort_values(by=['playerId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group, window=5, min_periods=1):
    """Return the trailing rolling mean of every column in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single entity (e.g. one player), already in chronological order.
    window : int, default 5
        Number of most recent rows averaged for each output row.
    min_periods : int, default 1
        Minimum number of rows required to emit a value; with 1, the first
        rows are averaged over however many rows exist so far.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``group``, holding the rolling means.

    Notes
    -----
    With pandas' default window closure the current row is part of its own
    window, so each value includes that row's own statistics.
    """
    return group.rolling(window=window, min_periods=min_periods).mean()

# Columns to average: every numeric column except the game identifier
numeric_columns = (
    nhl_stats_df.select_dtypes(include='number')
    .columns
    .drop('gameId')
)

# Everything that is not averaged is carried over unchanged
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Rolling mean per player; rows keep their original index labels
rolling_avg_numeric_df = (
    nhl_stats_df
    .groupby('playerId', group_keys=False)[numeric_columns]
    .apply(calc_rolling_avg)
)

# Put the untouched columns and the averaged columns back side by side (aligned on the index)
rolling_avg_df = pd.concat(
    [nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df],
    axis=1,
)

# Keep only the most recent row of every player (rows are ordered by player
# and date), i.e. the averages as of that player's last game
player_df = rolling_avg_df.groupby('playerId').tail(1)
player_df

Unnamed: 0,lastName,positionCode,shootsCatches,skaterFullName,gameDate,gameId,homeRoad,opponentTeamAbbrev,teamAbbrev,assists,...,ppPrimaryAssists,ppPrimaryAssistsPer60,ppSecondaryAssists,ppSecondaryAssistsPer60,ppShootingPct,ppShots,ppShotsPer60,ppTimeOnIce,ppTimeOnIcePctPerGame,ppTimeOnIcePerGame
46952,Suter,D,L,Ryan Suter,2024-04-17,2023021305,H,STL,DAL,0.0,...,0.0,0.0000,0.0,0.000,0.0,0.0,0.0000,4.2,0.01680,4.2
46905,Carter,C,R,Jeff Carter,2024-04-17,2023021303,R,NYI,PIT,0.0,...,0.0,,0.0,,0.2,0.4,16.5226,78.8,0.32440,78.8
47144,Parise,L,L,Zach Parise,2024-04-18,2023021310,H,EDM,COL,0.0,...,0.0,0.0000,0.0,0.000,0.0,0.6,67.7342,17.6,0.06240,17.6
46618,Burns,D,R,Brent Burns,2024-04-16,2023021296,R,CBJ,CAR,0.6,...,0.4,22.7885,0.0,0.000,0.0,0.6,13.6718,145.6,0.50480,145.6
47059,Perry,R,R,Corey Perry,2024-04-18,2023021310,R,COL,EDM,0.2,...,0.0,0.0000,0.0,0.000,0.0,0.0,0.0000,54.8,0.32125,54.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46653,Smejkal,L,L,Jiri Smejkal,2024-04-16,2023021295,R,BOS,OTT,0.0,...,0.0,,0.0,,0.0,0.2,6.9232,83.6,0.23220,83.6
6863,Matinpalo,D,R,Nikolas Matinpalo,2023-11-08,2023020190,R,TOR,OTT,0.0,...,0.0,0.0000,0.0,0.000,0.0,0.0,0.0000,1.5,0.00400,1.5
27785,Merela,C,R,Waltteri Merela,2024-01-27,2023020767,H,NJD,TBL,0.0,...,0.0,0.0000,0.0,0.000,0.0,0.0,0.0000,0.0,0.00000,0.0
36135,Koch,D,L,Patrik Koch,2024-03-07,2023021001,H,MIN,ARI,0.0,...,0.0,0.0000,0.0,0.000,0.0,0.0,0.0000,0.0,0.00000,0.0


In [10]:
player_df[player_df['opponentTeamAbbrev'] == 'NYR']

Unnamed: 0,lastName,positionCode,shootsCatches,skaterFullName,gameDate,gameId,homeRoad,opponentTeamAbbrev,teamAbbrev,assists,...,ppPrimaryAssists,ppPrimaryAssistsPer60,ppSecondaryAssists,ppSecondaryAssistsPer60,ppShootingPct,ppShots,ppShotsPer60,ppTimeOnIce,ppTimeOnIcePctPerGame,ppTimeOnIcePerGame
13364,Benning,D,R,Matt Benning,2023-12-03,2023020370,R,NYR,SJS,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43725,Walman,D,L,Jake Walman,2024-04-05,2023021213,H,NYR,DET,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.8,0.1088,25.8
46499,Imama,L,L,Bokondji Imama,2024-04-15,2023021292,R,NYR,OTT,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44403,White,C,R,Colin White,2024-04-07,2023021234,R,NYR,MTL,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.6,0.0194,11.6
28525,Sergachev,D,L,Mikhail Sergachev,2024-02-07,2023020791,R,NYR,TBL,0.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,77.6,0.3062,77.6
32309,Bastian,R,R,Nathan Bastian,2024-02-22,2023020891,H,NYR,NJD,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.8,0.0118,3.8
28858,Pitlick,C,L,Rem Pitlick,2024-02-09,2023020802,H,NYR,CHI,0.0,...,0.0,,0.0,,0.0,0.4,12.0324,127.6,0.3754,127.6
13209,Foudy,C,L,Liam Foudy,2023-12-02,2023020358,H,NYR,NSH,0.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.8,0.0222,10.8
40399,Bjornfot,D,L,Tobias Bjornfot,2024-03-23,2023021119,R,NYR,FLA,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33794,Johnson,C,L,Kent Johnson,2024-02-28,2023020939,R,NYR,CBJ,0.0,...,0.0,,0.0,,0.0,0.4,10.9538,126.6,0.4806,126.6


# THE PROBLEM

The code below will return the last game of each of the 32 teams, but some opponents occur multiple times, and we miss out on 6 teams.

This will cut the data in half after team_df is merged with itself, because lots of the matching games don't exist.

This approach is badly broken; going to redo the whole thing and go with a cumulative method.

### Get the average of each team's stats over their last 5 games, and return the last occurrence

In [11]:
# Sort the dataframe by team and date so that each team's rows are in
# chronological order for the rolling window computed below
nhl_stats_df = team_df.sort_values(by=['teamId', 'gameDate'])

# Define a function to calculate the rolling average for numeric columns
def calc_rolling_avg(group, window=5, min_periods=1):
    """Return the trailing rolling mean of every column in ``group``.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single entity (e.g. one team), already in chronological order.
    window : int, default 5
        Number of most recent rows averaged for each output row.
    min_periods : int, default 1
        Minimum number of rows required to emit a value; with 1, the first
        rows are averaged over however many rows exist so far.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``group``, holding the rolling means.

    Notes
    -----
    With pandas' default window closure the current row is part of its own
    window, so each value includes that row's own statistics.
    """
    return group.rolling(window=window, min_periods=min_periods).mean()

# Columns to average: every numeric column except the game identifier
numeric_columns = (
    nhl_stats_df.select_dtypes(include='number')
    .columns
    .drop('gameId')
)

# Everything that is not averaged is carried over unchanged
non_numeric_columns = [col for col in nhl_stats_df.columns if col not in numeric_columns]

# Rolling mean per team; rows keep their original index labels
rolling_avg_numeric_df = (
    nhl_stats_df
    .groupby('teamId', group_keys=False)[numeric_columns]
    .apply(calc_rolling_avg)
)

# Put the untouched columns and the averaged columns back side by side (aligned on the index)
rolling_avg_df = pd.concat(
    [nhl_stats_df[non_numeric_columns], rolling_avg_numeric_df],
    axis=1,
)

## Get the last occurrence of each team's stats and store it back into team_df
team_df = rolling_avg_df.groupby('teamId').tail(1)
team_df

Unnamed: 0,gameDate,gameId,homeRoad,opponentTeamAbbrev,teamFullName,teamAbbrev,faceoffWinPct,gamesPlayed,goalsAgainst,goalsAgainstPerGame,...,pkNetGoals,pkNetGoalsPerGame,pkTimeOnIcePerGame,pointsPct,ppGoalsAgainst,ppGoalsAgainstPerGame,shGoalsFor,shGoalsForPerGame,timesShorthanded,timesShorthandedPerGame
2584,2024-04-15,2023021288,H,NYI,New Jersey Devils,NJD,0.502202,1.0,3.4,3.4,...,-0.4,-0.4,259.8,0.3,0.4,0.4,0.0,0.0,2.8,2.8
2611,2024-04-17,2023021303,H,PIT,New York Islanders,NYI,0.475678,1.0,2.2,2.2,...,-0.8,-0.8,174.0,0.9,0.8,0.8,0.0,0.0,1.8,1.8
2573,2024-04-15,2023021292,H,OTT,New York Rangers,NYR,0.536586,1.0,2.4,2.4,...,0.4,0.4,314.0,0.6,0.0,0.0,0.4,0.4,2.8,2.8
2602,2024-04-16,2023021299,H,WSH,Philadelphia Flyers,PHI,0.483726,1.0,3.6,3.6,...,-0.2,-0.2,141.0,0.4,0.4,0.4,0.2,0.2,1.6,1.6
2610,2024-04-17,2023021303,R,NYI,Pittsburgh Penguins,PIT,0.52016,1.0,4.2,4.2,...,-0.2,-0.2,241.4,0.5,0.6,0.6,0.4,0.4,2.4,2.4
2588,2024-04-16,2023021295,H,OTT,Boston Bruins,BOS,0.566948,1.0,3.0,3.0,...,0.0,0.0,227.6,0.4,0.2,0.2,0.2,0.2,2.0,2.0
2576,2024-04-15,2023021290,R,TBL,Buffalo Sabres,BUF,0.413718,1.0,2.6,2.6,...,0.0,0.0,232.4,0.5,0.2,0.2,0.2,0.2,1.8,1.8
2603,2024-04-16,2023021298,H,DET,Montréal Canadiens,MTL,0.49738,1.0,3.8,3.8,...,-0.4,-0.4,289.8,0.6,0.6,0.6,0.2,0.2,2.6,2.6
2601,2024-04-16,2023021295,R,BOS,Ottawa Senators,OTT,0.460883,1.0,2.6,2.6,...,-0.4,-0.4,365.4,0.6,0.6,0.6,0.2,0.2,3.6,3.6
2608,2024-04-17,2023021304,R,TBL,Toronto Maple Leafs,TOR,0.455497,1.0,4.8,4.8,...,-1.4,-1.4,354.4,0.3,1.4,1.4,0.0,0.0,3.8,3.8


In [12]:
teams = nhl_teams.keys()

# Print every team of the lookup table that has no row in team_df
# (nothing is printed when all teams are present)
names_present = team_df['teamFullName'].to_list()
for team_name in teams:
    if team_name not in names_present:
        print(team_name)

In [13]:
len(team_df['opponentTeamAbbrev'].unique())

26

### Group player stats, team stats, and opposing team stats into one dataframe for prediction

In [None]:
print('team: ', len(team_df))

# Pair every team row with the row of its opponent in the same game (right
# side matched through `opponentTeamAbbrev`, its columns suffixed `_opponent`).
# team_df holds one row per team at this point (its latest game), so a team
# only finds a partner when its opponent's latest game is that same game.
team_df = pd.merge(
    team_df,
    team_df,
    left_on=['teamAbbrev', 'gameId'],
    right_on=['opponentTeamAbbrev', 'gameId'],
    suffixes=('', '_opponent'),
)

print('player: ', len(player_df))
print('team: ', len(team_df))

# Inner join: players are kept only when their latest game matches a paired team row
df = pd.merge(team_df, player_df, on=['teamAbbrev', 'gameId'])

# Columns present in both frames arrive as `<name>_x` (team side) and
# `<name>_y` (player side): drop the team copy and give the player copy the
# plain name again. Only the trailing suffix is stripped.
df = df.loc[:, ~df.columns.str.endswith('_x')]
df = df.rename(columns=lambda col: col[:-2] if col.endswith('_y') else col)

df = df.fillna(0)
df = df.drop_duplicates()
df

In [None]:
teams = nhl_teams.keys()

# Print every team of the lookup table that has no row left in team_df after
# the self-merge (nothing is printed when all teams are present)
names_present = team_df['teamFullName'].to_list()
for team_name in teams:
    if team_name not in names_present:
        print(team_name)

In [None]:
df['skaterFullName'].sample()

### Load bettable players for prediction & match them with the averaged stats dataframe

In [None]:
# Load the list of players to predict for; later cells read its
# 'skaterFullName' and 'scored' columns. The path is relative to the
# notebook's working directory.
bum_df = pd.read_csv('../../../lib/ai_bum_list.csv') 
bum_df.head()

In [None]:
bum_df[bum_df['scored'] > 0]

In [None]:
df

In [None]:
# Row of the betting list referenced by the disabled lookup below
i = 10

# df[df['skaterFullName'] == bum_df.iloc[i]['skaterFullName']]

# Spot check: show the combined rows of one known player. The comparison is
# used as a row filter; on its own it would only display a True/False mask.
df[df['skaterFullName'] == 'Jamie Benn']

In [None]:
# Rows of the combined frame that belong to players on the betting list.
# The following cells read and extend `pred_df`, so it has to be defined
# here; `.copy()` makes it an independent frame that can take new columns.
pred_df = df[df['skaterFullName'].isin(bum_df['skaterFullName'])].copy()

pred_list = list()

# Sanity check: how many combined rows exist for the first listed player
# (the loop this replaces stopped after its first iteration)
if len(bum_df) > 0:
    last_10 = df[df['skaterFullName'] == bum_df.iloc[0]['skaterFullName']]
    print(len(last_10))

In [None]:
# Binary flag: 1 when the row's ppPoints is at least 1, otherwise 0.
# NOTE(review): in this part of the notebook the player columns are rolling
# means, so ppPoints is presumably an average here rather than a single
# game's count — confirm that `>= 1` is the intended rule.
pred_df['ppPoint_scored'] = (pred_df['ppPoints'] >= 1).astype('int64')

In [None]:
pred_df.loc[pred_df['ppPoint_scored'] == 1]['skaterFullName']

### Prepare the pred_df for prediction

In [None]:
# Earlier exclusion list. No live code in this cell reads it, and several of
# its entries also appear in `to_keep`.
to_remove = ['gameId', 'penaltyKillNetPct', 'powerPlayNetPct', 'regulationAndOtWins', 'teamFullName', 'teamId', 'ties', 'winsInShootout', 
             'ppOpportunitiesPerGame', 'shGoalsAgainstPerGame', 'netPenalties', 'netPenaltiesPer60', 'pkNetGoalsPerGame', 'opponentTeamAbbrev_opponent',
             'penaltyKillNetPct_opponent', 'regulationAndOtWins_opponent', 'teamFullName_opponent', 'teamId_opponent', 'ties_opponent', 'winsInShootout_opponent',
             'ppNetGoalsPerGame_opponent', 'ppOpportunitiesPerGame_opponent', 'shGoalsAgainstPerGame_opponent', 'netPenaltiesPer60_opponent',
             'netPenalties_opponent', 'pkNetGoalsPerGame_opponent', 'ppGoalsAgainstPerGame_opponent', 'shGoalsForPerGame_opponent', 'timesShorthandedPerGame_opponent',
             'teamAbbrev_opponent', 'lastName', 'pointsPerGame', 'shootsCatches', 'skaterFullName', 'opponentTeamAbbrev', 
             'ppGoalsForPer60', 'ppIndividualSatForPer60', 'ppPointsPer60', 'ppPrimaryAssistsPer60', 'ppSecondaryAssistsPer60', 'ppShotsPer60', 'ppTimeOnIcePctPerGame',
             'ppTimeOnIcePerGame', 'goalsForPerGame', 'teamAbbrev', 'gameDate_opponent', 'powerPlayGoalsFor', 'ppGoals', 'gameDate', 'playerId'
            ]

# Columns fed to the model, in the same order as in the training section;
# the last entry is the label column.
to_keep = [
    'penaltyKillPct',
    'powerPlayPct',
    'powerPlayGoalsFor',
    'ppGoalsPerGame',
    'ppOpportunitiesPerGame',
    'penaltyKillPct_opponent',
    'netPenaltiesPer60_opponent',
    'assists',
    'evPoints',
    'gamesPlayed',
    'ppPoints',
    'ppTimeOnIcePerGame',
    'ppPoint_scored',
]

# Reduce the prediction frame to exactly those columns and display it
input_df = pred_df.loc[:, to_keep]
input_df

In [None]:
# One-hot encode the categorical columns of the prediction frame the same way
# as in the training section. The encoding starts from `pred_df`, which still
# contains these columns, and every step builds on the previous result: each
# source column is replaced by its indicator columns.
input_df = pd.get_dummies(pred_df, columns=['positionCode', 'homeRoad', 'homeRoad_opponent'])

# Keep only the model's columns, in model order (this drops the indicator
# columns again, since `to_keep` does not list them)
input_df = input_df[to_keep]
input_df

In [None]:
# Put the columns in the order recorded for the model input; a column missing
# from input_df would be added filled with NaN
X_train_cols = to_keep
input_df = input_df.reindex(columns=X_train_cols)
# (input_df['ppPoint_scored'] > 0).sum()

# Show the rows whose label flag is set
row = input_df[input_df['ppPoint_scored'] > 0]
row

In [None]:
# input_df = scaler.transform(input_df)

### Prediction

In [None]:
# Number of consecutive rows that form one input sequence
T = 10
X_pred = []
Y_pred = []

### Reshaping The Data for LSTM
# NOTE(review): this cell is unfinished as written:
#   * `player_database` is not defined anywhere in the visible notebook;
#   * `input_df` was reduced to the `to_keep` columns, which do not include
#     'skaterFullName';
#   * the unconditional `break` ends the loop in its first iteration, so the
#     window-building code below it never runs and X_pred / Y_pred stay empty.
for t in range(len(input_df) - T):

  ### FIND ALL INSTANCES OF PLAYER IN PLAYER DB, THEN TAKE LAST T ROWS OF DATA
  last_10_games = player_database[player_database['skaterFullName'] == input_df.iloc[t]['skaterFullName'].values[0]].tail(T)
  print(last_10_games)
  break
  # Window t: feature columns of rows t .. t+T-1 (label column dropped)
  x = input_df.iloc[t:t + T]
  x = x.drop('ppPoint_scored', axis=1)
  X_pred.append(x.values)

  # Label of the window: target of the row that follows it (row t+T)
  y = input_df.iloc[t + T]['ppPoint_scored']
  Y_pred.append(y)

# Convert the collected windows and labels to arrays (`y_pred` holds the labels)
X_pred = np.array(X_pred)
y_pred = np.array(Y_pred)

print("X_pred.shape", X_pred.shape, "y_pred.shape", y_pred.shape)

In [None]:
# input_df = input_df.reshape(input_df.shape[0], T, input_df.shape[1])

# Model outputs for the prediction windows.
# NOTE(review): X_pred is not passed through the fitted scaler here (the
# scaler.transform call in the earlier cell is commented out), while the
# training data was scaled — confirm.
y_preds = model.predict(X_pred)

In [None]:
from sklearn.metrics import classification_report

# Threshold experiment kept for reference (disabled):
# y_pred_probs = model.predict(X_test)
# threshold = 0.9 # You can experiment with different threshold values
# y_pred_labels = (y_pred_probs > threshold).astype(int)
# print(f'threshold of {threshold*100}%')
# print(classification_report(y_test, y_pred_labels))

# Round the model outputs (`y_preds`) to 0/1 labels and compare them with the
# true labels (`y_pred`, built in the reshaping cell). Rounding `y_pred`
# itself would compare the labels with themselves and always look perfect.
y_preds_res = np.round(y_preds).astype(int).ravel()
print('Regular threshold')
print(classification_report(y_pred, y_preds_res))

In [None]:
# input_df = input_df.reshape(input_df.shape[0], input_df.shape[2])
# input_df = scaler.inverse_transform(input_df)
# input_df = pd.DataFrame(input_df, columns=X_train_cols)
# pred_df['predictions'] = preds