In [1]:
# Import the shot data.
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Location of the pre-processed shot data, relative to the working directory.
DATA_FOLDER = 'data/'
DATAFRAME_FILE = DATA_FOLDER + 'processed_data.pkl'

# Resolve the pickle against the current working directory and load it.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source (e.g. ones produced by this project's own processing step).
current_dir = Path.cwd()
frame_path = current_dir / DATAFRAME_FILE
shots_df = pd.read_pickle(frame_path)

In [2]:
# Summarise each column's dtype and non-null count to spot missing values.
shots_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118280 entries, 0 to 118279
Data columns (total 27 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   period               118280 non-null  int64  
 1   seconds_remaining    118280 non-null  int64  
 2   att_score            118280 non-null  int32  
 3   def_score            118280 non-null  int32  
 4   lead_size            118280 non-null  int32  
 5   att_players          118268 non-null  Int64  
 6   att_skaters          118268 non-null  Int64  
 7   att_forwards         118268 non-null  Int64  
 8   def_players          118268 non-null  Int64  
 9   def_skaters          118268 non-null  Int64  
 10  def_forwards         118268 non-null  Int64  
 11  event_coord_x        118277 non-null  float64
 12  event_coord_y        118278 non-null  float64
 13  is_playoff_game      118280 non-null  int32  
 14  is_home              118280 non-null  int32  
 15  is_overtime      

In [3]:
# Remove pandas' column-display limit so every column of the wide frame is
# rendered. This is a session-wide setting and stays in effect for later cells.
pd.options.display.max_columns = None
shots_df.head()

Unnamed: 0,period,seconds_remaining,att_score,def_score,lead_size,att_players,att_skaters,att_forwards,def_players,def_skaters,def_forwards,event_coord_x,event_coord_y,is_playoff_game,is_home,is_overtime,is_extra_attacker,is_empty_net,is_rebound,is_goal,shot_type_backhand,shot_type_deflected,shot_type_slap,shot_type_snap,shot_type_tip,shot_type_wrap,shot_type_wrist
0,1,1171,0,0,0,6,5,3,6,5,3,78.0,-19.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,1,1151,0,0,0,6,5,3,6,5,3,37.0,10.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,1,1140,0,0,0,6,5,3,6,5,3,47.0,-23.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1,971,0,0,0,6,5,3,6,5,3,73.0,22.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,1,966,0,0,0,6,5,3,6,5,3,53.0,14.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [4]:
# Summary statistics, transposed so each feature is a row and each statistic a column.
shots_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
period,118280.0,2.023174,0.838962,1.0,1.0,2.0,3.0,5.0
seconds_remaining,118280.0,593.05153,347.601714,0.0,288.0,596.0,896.0,1200.0
att_score,118280.0,1.30875,1.369216,0.0,0.0,1.0,2.0,9.0
def_score,118280.0,1.425042,1.434907,0.0,0.0,1.0,2.0,9.0
lead_size,118280.0,-0.116292,1.601392,-8.0,-1.0,0.0,1.0,8.0
att_players,118268.0,5.939527,0.278203,4.0,6.0,6.0,6.0,6.0
att_skaters,118268.0,4.95971,0.313196,3.0,5.0,5.0,5.0,6.0
att_forwards,118268.0,3.084173,0.442879,1.0,3.0,3.0,3.0,6.0
def_players,118268.0,5.826259,0.418524,4.0,6.0,6.0,6.0,6.0
def_skaters,118268.0,4.832271,0.427537,3.0,5.0,5.0,5.0,6.0


In [5]:
# Split the frame into predictors (everything except the outcome column)
# and the integer-coded target.
X = shots_df.drop(columns='is_goal')
y = shots_df['is_goal'].astype(int)

In [6]:
# Hold out 20% of the shots for evaluation. Stratifying on the outcome keeps
# the goal rate the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=shots_df['is_goal'],
    random_state=42,
)

In [7]:
def goal_prob(y):
    """Return the fraction of outcomes in ``y`` that are goals.

    Parameters
    ----------
    y : array-like
        Shot outcomes coded as 0/1 (or bool), e.g. a list, NumPy array or
        pandas Series.

    Returns
    -------
    numpy.float64
        ``sum(y) / len(y)``, i.e. the mean along the first axis.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty. The exception type matches what the plain
        ``sum(y) / len(y)`` expression raised, but with an explicit message.
    """
    outcomes = np.asarray(y, dtype=float)
    if outcomes.size == 0:
        raise ZeroDivisionError("goal_prob() needs at least one outcome; got an empty input")
    # Vectorised mean instead of the builtin sum(), which loops over the
    # elements one by one in Python.
    return outcomes.mean(axis=0)
# Report the overall goal rate of the full target vector (before the train/test split).
print(f"Probability of goal in sample: {goal_prob(y):.3f}")

Probability of goal in sample: 0.068


In [22]:
# Baseline goal-probability model: fill missing values with each column's
# median, standardise the features, then fit a plain logistic regression.
# `linear_model` is imported in the notebook's top import cell.
# A class_weight='balanced' variant of the classifier is not used here.
pipe = make_pipeline(
    SimpleImputer(missing_values=np.nan, strategy='median'),
    StandardScaler(),
    linear_model.LogisticRegression(),
)

In [23]:
# Fit the whole pipeline on the training split, then predict on the held-out split.
pipe.fit(X_train, y_train)
# One row per test shot; columns follow pipe.classes_, so for 0/1 labels
# column 1 is the predicted probability of a goal.
y_prob_pred = pipe.predict_proba(X_test)
# Hard class labels.
y_pred = pipe.predict(X_test)

# Regression-style metrics computed on the hard class labels.
print("Mean squared error (hard labels): "
      f"{mean_squared_error(y_test, y_pred):.3f}")
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination (hard labels): "
      f"{r2_score(y_test, y_pred):.3f}")

# The same metrics computed on the predicted goal probability. The mean
# squared error of a predicted probability is also known as the Brier score.
print("Mean squared error (goal probability): "
      f"{mean_squared_error(y_test, y_prob_pred[:, 1]):.3f}")
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination (goal probability): "
      f"{r2_score(y_test, y_prob_pred[:, 1]):.3f}")

# Mean accuracy on the test split. With a goal rate of about 0.068 in the
# sample, always predicting "no goal" would already score roughly 0.932, so
# accuracy alone says little about model quality here.
pipe.score(X_test, y_test)

Mean squared error: 0.067
Coefficient of determination: -0.053
Mean squared error: 0.060
Coefficient of determination: 0.051


0.933251606357795

In [16]:
# Inspect the predicted class probabilities (one row per test shot, one column per class).
# y_prob_pred is used because y_pred holds the 1-D hard labels from pipe.predict().
y_prob_pred

array([[0.49613991, 0.50386009],
       [0.41878286, 0.58121714],
       [0.76185708, 0.23814292],
       ...,
       [0.62718667, 0.37281333],
       [0.30462694, 0.69537306],
       [0.71834305, 0.28165695]])

In [17]:
# Second column only, i.e. the predicted probability of the positive (goal) class.
# y_pred is the 1-D label vector, so indexing it with [:, 1] would raise IndexError.
y_prob_pred[:, 1]

array([0.50386009, 0.58121714, 0.23814292, ..., 0.37281333, 0.69537306,
       0.28165695])

In [12]:
# Column-wise sum of the predicted probabilities: the expected number of
# test shots in each class according to the model.
y_prob_pred.sum(axis=0)

array([22078.02090291,  1577.97909709])

In [13]:
# Mean over every entry of the probability matrix. Each row sums to 1 across
# the two classes, so this is 0.5 by construction; use y_prob_pred[:, 1].mean()
# for the average predicted goal probability instead.
np.mean(y_prob_pred)

0.5