In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


## Advanced feature selections

Here we tried 2 selection methods:
1. Random Forest classifier
2. Feature selection based on correlation

### 1. Random Forest classifier

In [2]:
# load data filtering out missing values
data = pd.read_csv('advanced_models_data.csv')
data.dropna(inplace=True)
print(data.isna().sum())

Unnamed: 0           0
GameID               0
ShotType             0
Period               0
GameTime             0
XCoord               0
YCoord               0
isEmptyNet           0
isGoal               0
DistanceToGoal       0
ShootingAngle        0
LastEvent            0
LastEvent_XCoord     0
LastEvent_YCoord     0
TimeLastEvent        0
DistanceLastEvent    0
Rebound              0
AngleChange          0
Speed                0
dtype: int64


In [3]:
# setting the seed variable so it can be used throughout the notebook
seed = 42

In [4]:
# print out the cateforical features so we know which columns to encode
categorical = [var for var in data.columns if data[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

There are 2 categorical variables

The categorical variables are :

 ['ShotType', 'LastEvent']


In [5]:
# splitting data 
X = data.drop('isGoal', axis=1)
y = data['isGoal']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [6]:
# encode categorical variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['ShotType', 'LastEvent'], use_cat_names=True)
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

In [7]:
# scaling the data
cols = X_train.columns
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [8]:
# check to make sure there are no missing values, otherwise RnadomForestClassifier will not work
np.isnan(X_train).any()

False

In [9]:
# Random Forest Classifier
rfc = RandomForestClassifier(random_state=seed)
# fit the model
rfc.fit(X_train, y_train)
# Predict the Test set results
y_pred = rfc.predict(X_val)
# Check accuracy score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model accuracy score: 0.9975


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [10]:
# print out the feature scores
X_train = pd.DataFrame(X_train, columns=[cols])
feature_scores = pd.Series(rfc.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

TimeLastEvent             0.188946
LastEvent_XCoord          0.172898
Speed                     0.154576
LastEvent_YCoord          0.147365
DistanceLastEvent         0.138546
LastEvent_FACEOFF         0.081437
DistanceToGoal            0.026843
XCoord                    0.014228
Period                    0.012378
GameTime                  0.011503
YCoord                    0.009971
isEmptyNet                0.009681
LastEvent_SHOT            0.005615
ShootingAngle             0.003911
Unnamed: 0                0.003618
GameID                    0.003364
Rebound                   0.002831
AngleChange               0.002786
LastEvent_HIT             0.001890
LastEvent_PENALTY         0.001561
LastEvent_BLOCKED_SHOT    0.001331
LastEvent_GIVEAWAY        0.000817
ShotType_Slap Shot        0.000667
ShotType_Wrist Shot       0.000561
LastEvent_MISSED_SHOT     0.000542
LastEvent_TAKEAWAY        0.000499
ShotType_Tip-In           0.000464
ShotType_Snap Shot        0.000398
ShotType_Backhand   

In [11]:
# get the top 7 features
feature_scores[0:7]

TimeLastEvent        0.188946
LastEvent_XCoord     0.172898
Speed                0.154576
LastEvent_YCoord     0.147365
DistanceLastEvent    0.138546
LastEvent_FACEOFF    0.081437
DistanceToGoal       0.026843
dtype: float64

### Correlation matrix

In [14]:
# preprocessing the data
data = pd.read_csv('advanced_models_data.csv')
encoder = ce.OneHotEncoder(cols=['ShotType', 'LastEvent'], use_cat_names=True)
data_corr = encoder.fit_transform(data)
data_corr

Unnamed: 0.1,Unnamed: 0,GameID,ShotType_Wrist Shot,ShotType_Snap Shot,ShotType_Tip-In,ShotType_Backhand,ShotType_Slap Shot,ShotType_Deflected,ShotType_Wrap-around,ShotType_nan,...,LastEvent_PENALTY,LastEvent_BLOCKED_SHOT,LastEvent_GOAL,LastEvent_XCoord,LastEvent_YCoord,TimeLastEvent,DistanceLastEvent,Rebound,AngleChange,Speed
0,3,2015020001,1,0,0,0,0,0,0,0,...,0,0,0,94.0,-34.0,11.0,154.0,False,0.0,14.000000
1,5,2015020001,0,1,0,0,0,0,0,0,...,0,0,0,-37.0,-28.0,7.0,50.0,False,0.0,7.142857
2,6,2015020001,0,1,0,0,0,0,0,0,...,0,0,0,-79.0,0.0,1.0,4.0,True,86.0,4.000000
3,9,2015020001,1,0,0,0,0,0,0,0,...,0,0,0,-91.0,35.0,18.0,80.0,False,0.0,4.444444
4,15,2015020001,0,0,1,0,0,0,0,0,...,0,0,0,-69.0,-22.0,5.0,22.0,False,0.0,4.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330727,1395554,2018030417,0,1,0,0,0,0,0,0,...,0,0,0,0.0,0.0,0.0,77.0,False,0.0,0.000000
330728,1395556,2018030417,0,1,0,0,0,0,0,0,...,0,0,0,-68.0,-29.0,12.0,36.0,False,0.0,3.000000
330729,1395558,2018030417,0,1,0,0,0,0,0,0,...,0,0,0,-69.0,-22.0,0.0,18.0,False,0.0,0.000000
330730,1395560,2018030417,1,0,0,0,0,0,0,0,...,0,0,0,69.0,-22.0,0.0,37.0,False,0.0,0.000000


In [15]:
# constructing the matrix
data_corr.corr().style.format("{:.4}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Unnamed: 0.1,Unnamed: 0,GameID,ShotType_Wrist Shot,ShotType_Snap Shot,ShotType_Tip-In,ShotType_Backhand,ShotType_Slap Shot,ShotType_Deflected,ShotType_Wrap-around,ShotType_nan,Period,GameTime,XCoord,YCoord,isEmptyNet,isGoal,DistanceToGoal,ShootingAngle,LastEvent_HIT,LastEvent_GIVEAWAY,LastEvent_SHOT,LastEvent_FACEOFF,LastEvent_TAKEAWAY,LastEvent_MISSED_SHOT,LastEvent_PENALTY,LastEvent_BLOCKED_SHOT,LastEvent_GOAL,LastEvent_XCoord,LastEvent_YCoord,TimeLastEvent,DistanceLastEvent,Rebound,AngleChange,Speed
Unnamed: 0,1.0,0.9687,0.02925,-0.01671,0.0107,-0.0007132,-0.03615,0.02244,-0.001799,-0.002076,-0.005542,0.0008758,-0.002045,0.00295,0.001699,0.005342,-0.09872,0.001191,-0.00935,0.01633,0.006515,-0.006601,0.01347,-0.004575,-0.007211,-0.004565,0.003722,-0.003272,0.0006922,-0.009489,-0.003302,0.007056,0.0027,0.004652
GameID,0.9687,1.0,0.02791,-0.01431,0.0104,-0.001392,-0.03559,0.02113,-0.001549,-0.00231,-0.007182,0.00115,-0.00151,0.003165,0.001156,0.006004,-0.1017,0.001263,-0.01353,0.01558,0.008246,-0.004972,0.01331,-0.003953,-0.004789,-0.005788,0.004271,-0.002543,0.001653,-0.008806,-0.004845,0.008709,0.004347,0.003096
ShotType_Wrist Shot,0.02925,0.02791,1.0,-0.4428,-0.2443,-0.316,-0.4717,-0.1341,-0.1095,-0.01126,0.007471,-0.002465,0.001831,-0.02673,0.03422,-0.01309,-0.01027,0.01204,0.009136,0.001187,0.01154,-0.0172,0.007399,-0.005781,-0.02314,0.009681,0.01044,0.00161,-0.007073,-0.006967,0.02372,0.01125,0.001134,0.02738
ShotType_Snap Shot,-0.01671,-0.01431,-0.4428,1.0,-0.0954,-0.1234,-0.1842,-0.05238,-0.04276,-0.004399,-0.008717,-0.01148,-0.0008101,0.01488,-0.01483,0.002441,0.05065,0.003135,0.004777,0.008398,-0.02158,0.01089,0.01146,0.0004372,-0.001245,-0.009661,-0.004793,-0.0016,0.005811,0.004444,0.003476,-0.02146,-0.01311,-0.009677
ShotType_Tip-In,0.0107,0.0104,-0.2443,-0.0954,1.0,-0.06809,-0.1016,-0.0289,-0.02359,-0.002427,-0.008315,0.002554,-0.002092,0.003835,-0.01234,0.06647,-0.1373,-0.005377,-0.008363,-0.006275,-0.00593,0.02901,-0.01425,0.0006171,-0.0006129,-0.008963,-0.00693,-0.00187,-0.001128,-0.008434,-0.01294,-0.005795,-0.0004052,-0.03865
ShotType_Backhand,-0.0007132,-0.001392,-0.316,-0.1234,-0.06809,1.0,-0.1315,-0.03738,-0.03052,-0.003139,0.01119,-0.00585,0.0001821,0.0004855,0.003852,0.02028,-0.1506,-0.02371,-0.004132,-0.002997,0.0299,-0.01271,0.007552,-0.00868,-0.016,-0.001089,0.006212,0.0006942,9.266e-05,-0.01465,0.04109,0.02893,0.01564,0.0241
ShotType_Slap Shot,-0.03615,-0.03559,-0.4717,-0.1842,-0.1016,-0.1315,1.0,-0.0558,-0.04555,-0.004686,-0.002894,0.01605,-0.0006175,0.01781,-0.02485,-0.04844,0.2007,0.003641,-0.009001,-0.0007264,-0.01207,0.000387,-0.01669,0.01227,0.04851,0.002993,-0.01028,-0.0009438,0.005091,0.02288,-0.05612,-0.01118,0.001989,-0.01465
ShotType_Deflected,0.02244,0.02113,-0.1341,-0.05238,-0.0289,-0.03738,-0.0558,1.0,-0.01295,-0.001332,-0.008683,0.0004814,0.002409,0.001096,-0.007853,0.03676,-0.06467,-0.0009384,-0.006896,-0.006078,-0.006952,0.0204,-0.005066,-0.002679,-0.001981,-0.003465,0.003028,0.0006783,-0.002098,-0.005558,-0.005275,-0.006784,-0.006462,-0.02165
ShotType_Wrap-around,-0.001799,-0.001549,-0.1095,-0.04276,-0.02359,-0.03052,-0.04555,-0.01295,1.0,-0.001088,0.001811,0.002586,-0.002719,0.003797,-0.004805,-0.009181,-0.08399,-0.007317,0.009008,-0.003357,0.003787,-0.008917,0.001234,0.007713,-0.01192,0.001767,-0.002685,0.002373,0.000425,0.0001505,-0.0005842,0.00377,-4.313e-05,-0.002486
ShotType_nan,-0.002076,-0.00231,-0.01126,-0.004399,-0.002427,-0.003139,-0.004686,-0.001332,-0.001088,1.0,0.008143,0.006757,-0.001371,-0.002021,0.0457,-0.003387,0.0005795,0.0007681,-0.002927,-0.001946,-0.002522,0.009814,-0.002533,-0.003134,-0.001973,-0.001334,0.0007267,0.0005636,0.002095,-0.006107,0.003427,-0.003254,-0.001993,-0.002732
