### Notebook Setup

In [8]:
# Import libraries
import pandas as pd
import numpy as np
import os.path as path
from sklearn import preprocessing
import warnings
import math
warnings.simplefilter(action = 'ignore', category = FutureWarning)
warnings.simplefilter(action = 'ignore', category = UserWarning)

In [9]:
# Settings to allow all columns in dataframe to display
desired_width = 320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns', 222)

In [10]:
# Load multiple datasets
skaters1213 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL_Player_Analysis/NHL_Player_Analysis/Datasets/nhl_player_data/skaters1213.csv')
skaters1314 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1314.csv')
skaters1415 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1415.csv')
skaters1516 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1516.csv')
skaters1617 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1617.csv')
skaters1718 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1718.csv')
skaters1819 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1819.csv')
skaters1920 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1920.csv')
skaters2021 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters2021.csv')
skaters2122 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters2122.csv')
#skaters2223 = pd.read_csv('/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters2223.csv')

filenames = [skaters1213, skaters1314, skaters1415, skaters1516, skaters1617, skaters1718, skaters1819, skaters1920, skaters2021]

df = pd.concat(f for f in filenames)

print('The shape of the data is:', df.shape)
df.tail()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/nathananderson/Documents/Data_Science/NHL Players/Datasets/nhl_player_data/skaters1314.csv'

### Feature Engineering

In [None]:
df['Last_Season_Played'] = df.groupby(["playerId"])['season'].transform(np.max)

: 

In [None]:
df.tail()

: 

### Slicing DataFrame

In [None]:
# Slicing the data
df = df[df['situation'] == 'all']
df = df[df['I_F_points'] >= 10]
df = df[df['Last_Season'] == df['season'].max()]
df.head()

: 

In [None]:
l_season = df['season'].max()

: 

In [None]:
l_season

: 

### Player ID DataFrame

In [None]:
# Player Name & Player ID DataFrame
players = df[['name', 'playerId','Last_Season']].copy()
players.head()


: 

In [None]:
# Removing duplicates
players = players.drop_duplicates(subset=['playerId'], keep='first')
players = players.sort_values('playerId').reset_index(drop = True)
players.head()

: 

### ML DataFrame

In [None]:
# DataFrame for ML
player_data = df[['playerId', 'games_played', 'I_F_points',  'I_F_goals',
                  'I_F_shotsOnGoal', 'I_F_penalityMinutes', 
                  'icetime', 'position'
                  ]].copy()

"""'onIce_xGoalsPercentage', 'offIce_xGoalsPercentage', 'onIce_corsiPercentage', 'offIce_corsiPercentage', 'icetime', 
'onIce_fenwickPercentage','offIce_fenwickPercentage', 'iceTimeRank'"""

player_data.head()

: 

In [None]:
# Instantiate MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

: 

In [None]:
# Normalizing 'ice time'
player_data[['icetime']] = scaler.fit_transform(player_data[['icetime']])

: 

In [None]:
# Instantiate LabelEncoder
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Label Encode 'position'
player_data[['position']] = le.fit_transform(player_data[['position']])

: 

In [None]:
# Print 'position' classes
le.classes_

: 

In [None]:
player_data.head()

: 

In [None]:
# Calculating metrics per game
player_data["% Games Played"] = player_data['games_played'] / 82
player_data["Points per Game"] = player_data['I_F_points'] / player_data['games_played']
player_data["Goals per Game"] = player_data['I_F_goals'] / player_data['games_played']
player_data["Shots on Goal per Game"] = player_data['I_F_shotsOnGoal'] / player_data['games_played']
#player_data["Primary Assists per Game"] = player_data['I_F_primaryAssists'] / player_data['games_played']
#player_data["Secondary Assists per Game"] = player_data['I_F_secondaryAssists'] / player_data['games_played']
player_data.head()

: 

In [None]:
scaler = MinMaxScaler()

player_data[['Shots on Goal per Game']] = scaler.fit_transform(player_data[['Shots on Goal per Game']])

: 

In [None]:
player_data.head()

: 

In [None]:
# Plots
import matplotlib.pyplot as plt
plt.scatter(player_data['games_played'], player_data['I_F_points'])
plt.scatter(player_data['games_played'], player_data['I_F_goals'])

plt.legend(['Points', 'Goals'])
plt.title('Points & Goals by Games Played')
plt.show()

: 

In [None]:
# DF for Machine Learning
player_data = player_data[['playerId', 'Points per Game', '% Games Played', 'Goals per Game', 'Shots on Goal per Game',
                            'icetime', 'position']]
player_data.head()

: 

### Machine Learning

In [None]:
# Separate features and labels for models
features = player_data.drop(labels = "Points per Game", axis = 1)
labels = player_data["Points per Game"]

: 

In [None]:
features.head()

: 

In [None]:
features.shape

: 

In [None]:
labels.head()

: 

In [None]:
labels.shape

: 

In [None]:
# Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 100)

: 

### Linear Regression

In [None]:
# Train a LinearRegression model
from sklearn.metrics import log_loss
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
#lm.fit(X_train.values,y_train)
lm.fit(X_train, y_train)
print("** Linear Regression model has been trained.")
    
# Get the train and test accuracy scores
print(f"** Training Score: {lm.score(X_train, y_train)}")
print(f"** Testing Score: {lm.score(X_test, y_test)}")

# Get the train and test logloss results
#print(f"** Training LogLoss: {log_loss(y_train, lm.predict(X_train))}")
#print(f"** Testing LogLoss: {log_loss(y_test, lm.predict_proba(X_test))}")

: 

In [None]:
name = "Mark Scheifele"

fn = df[df['name'].str.contains(name, case = False)]
fn.head(15)

: 

In [None]:
# Predict
pred = lm.predict([[8476460, .70, 0.35, .5, 0.7, 2]])
print(name + " will average:", float(np.round(pred, 4)), "points per game.")

: 

### Neural Network

In [None]:
import tensorflow as tf
np.random.seed(0)
tf.random.set_seed(0)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Activation, Dropout, Dense, Reshape, LSTM, GRU

: 

In [None]:
# Separate features and labels for models
features = player_data.drop(labels = "Points per Game", axis = 1)
labels = player_data["Points per Game"]

: 

In [None]:
features.head()

: 

In [None]:
features.shape

: 

In [None]:
#features = features.values.reshape(-1, 6, 1)

: 

In [None]:
features.shape

: 

In [None]:
labels.head()

: 

In [None]:
labels.shape

: 

#### Stacked NN

In [None]:
rnn = Sequential()

: 

In [None]:
rnn.add(Reshape((6, 1), input_shape = (6,)))

: 

In [None]:
rnn.add(LSTM(100, input_shape = (20, 1), return_sequences= True))
rnn.add(LSTM(200, return_sequences= True))
rnn.add(GRU(50))
rnn.add(Dense(1)) 
rnn.add(Activation('linear'))

: 

In [None]:
rnn.compile(loss = 'mean_squared_error', optimizer = 'adam')

: 

In [None]:
rnn.summary()

: 

In [None]:
#from tabnanny import verbose

rnn.fit(X_train, y_train, epochs = 25, batch_size = 10, verbose = 1, validation_split= .2)

: 

In [None]:
def get_model_perf(model_obj):
    """Get RMSE of a model for training and testing datasets"""

    score_train = model_obj.evaluate(X_train, y_train, verbose = 0)
    print('Training RMSE: %.2f RMSE' % (math.sqrt(score_train)))

    score_test = model_obj.evaluate(X_test, y_test, verbose = 0)
    print('Testing RMSE: %.2f RMSE' % (math.sqrt(score_test)))

: 

In [None]:
get_model_perf(rnn)

: 