In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt


## Advanced feature selections

Here we tried 2 selection methods:
1. Random Forest classifier
2. Feature selection based on correlation

### 1. Random Forest classifier

In [2]:
# load data filtering out missing values
data = pd.read_csv('advanced_models_data.csv')
data.dropna(inplace=True)
print(data.isna().sum())

Unnamed: 0           0
ShotType             0
Period               0
GameTime             0
XCoord               0
YCoord               0
isEmptyNet           0
isGoal               0
DistanceToGoal       0
ShootingAngle        0
LastEvent            0
LastEvent_XCoord     0
LastEvent_YCoord     0
TimeLastEvent        0
DistanceLastEvent    0
Rebound              0
AngleChange          0
Speed                0
dtype: int64


In [3]:
# setting the seed variable so it can be used throughout the notebook
seed = 42

In [4]:
# print out the cateforical features so we know which columns to encode
categorical = [var for var in data.columns if data[var].dtype=='O']
print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

There are 3 categorical variables

The categorical variables are :

 ['ShotType', 'GameTime', 'LastEvent']


In [5]:
# splitting data 
X = data.drop('isGoal', axis=1)
y = data['isGoal']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

In [6]:
# encode categorical variables with one-hot encoding
encoder = ce.OneHotEncoder(cols=['ShotType', 'GameTime', 'LastEvent'])
X_train = encoder.fit_transform(X_train)
X_val = encoder.transform(X_val)

In [7]:
# scaling the data
cols = X_train.columns
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [8]:
# check to make sure there are no missing values, otherwise RnadomForestClassifier will not work
np.isnan(X_train).any()

False

In [9]:
# Random Forest Classifier
rfc = RandomForestClassifier(random_state=seed)
# fit the model
rfc.fit(X_train, y_train)
# Predict the Test set results
y_pred = rfc.predict(X_val)
# Check accuracy score
print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_val, y_pred)))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model accuracy score: 0.9973


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [10]:
# print out the feature scores
X_train = pd.DataFrame(X_train, columns=[cols])
feature_scores = pd.Series(rfc.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_scores

TimeLastEvent        1.739083e-01
Speed                1.652295e-01
DistanceLastEvent    1.534609e-01
LastEvent_XCoord     1.381071e-01
LastEvent_YCoord     1.350657e-01
                         ...     
GameTime_1185        3.711502e-06
GameTime_1198        1.396072e-06
GameTime_1199        6.446713e-07
GameTime_1197        5.908130e-07
GameTime_1200        4.046445e-08
Length: 1231, dtype: float64

In [11]:
# get the top 7 features
feature_scores[0:7]

TimeLastEvent        0.173908
Speed                0.165229
DistanceLastEvent    0.153461
LastEvent_XCoord     0.138107
LastEvent_YCoord     0.135066
LastEvent_2          0.060953
DistanceToGoal       0.029464
dtype: float64

### Correlation matrix

In [12]:
# preprocessing the data
data = pd.read_csv('advanced_models_data.csv')
encoder = ce.OneHotEncoder(cols=['ShotType', 'LastEvent'])
data_corr = encoder.fit_transform(data.drop('GameTime', axis=1))
data_corr

Unnamed: 0.1,Unnamed: 0,ShotType_1,ShotType_2,ShotType_3,ShotType_4,ShotType_5,ShotType_6,ShotType_7,ShotType_8,Period,...,LastEvent_7,LastEvent_8,LastEvent_9,LastEvent_XCoord,LastEvent_YCoord,TimeLastEvent,DistanceLastEvent,Rebound,AngleChange,Speed
0,3,1,0,0,0,0,0,0,0,1,...,0,0,0,94.0,-34.0,11.0,154.0,False,0.0,14.000000
1,5,0,1,0,0,0,0,0,0,1,...,0,0,0,-37.0,-28.0,7.0,50.0,False,0.0,7.142857
2,6,0,1,0,0,0,0,0,0,1,...,0,0,0,-79.0,0.0,1.0,4.0,True,86.0,4.000000
3,9,1,0,0,0,0,0,0,0,1,...,0,0,0,-91.0,35.0,18.0,80.0,False,0.0,4.444444
4,15,0,0,1,0,0,0,0,0,1,...,0,0,0,-69.0,-22.0,5.0,22.0,False,0.0,4.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386779,1623549,1,0,0,0,0,0,0,0,3,...,0,0,0,-61.0,0.0,35.0,28.0,True,70.0,0.800000
386780,1623552,1,0,0,0,0,0,0,0,3,...,0,0,0,-68.0,38.0,16.0,35.0,False,0.0,2.187500
386781,1623553,0,0,0,0,1,0,0,0,3,...,0,0,0,-37.0,22.0,8.0,24.0,True,0.0,3.000000
386782,1623555,1,0,0,0,0,0,0,0,3,...,0,0,0,0.0,0.0,0.0,71.0,False,0.0,0.000000


In [13]:
# constructing the matrix
data_corr.corr().style.format("{:.4}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

Unnamed: 0.1,Unnamed: 0,ShotType_1,ShotType_2,ShotType_3,ShotType_4,ShotType_5,ShotType_6,ShotType_7,ShotType_8,Period,XCoord,YCoord,isEmptyNet,isGoal,DistanceToGoal,ShootingAngle,LastEvent_1,LastEvent_2,LastEvent_3,LastEvent_4,LastEvent_5,LastEvent_6,LastEvent_7,LastEvent_8,LastEvent_9,LastEvent_XCoord,LastEvent_YCoord,TimeLastEvent,DistanceLastEvent,Rebound,AngleChange,Speed
Unnamed: 0,1.0,0.03431,-0.02136,0.01117,-0.001618,-0.03787,0.02262,-0.00512,9.163e-05,-0.005912,-0.003064,0.005894,0.002531,0.006883,-0.08497,0.003715,-0.01073,0.01704,0.007208,-0.004937,0.0121,-0.00536,-0.004043,-0.008308,0.00544,-0.003016,0.001991,-0.005784,-0.005649,0.007655,0.002184,-0.0003754
ShotType_1,0.03431,1.0,-0.4429,-0.2478,-0.3184,-0.471,-0.1377,-0.1091,-0.01184,0.007507,0.002116,-0.02611,0.03504,-0.01454,-0.001462,0.01282,0.009098,0.002118,0.01154,-0.01765,0.006364,-0.004997,-0.02269,0.009144,0.01104,0.001998,-0.005981,0.003034,0.02238,0.01122,0.001314,0.02801
ShotType_2,-0.02136,-0.4429,1.0,-0.0951,-0.1222,-0.1807,-0.05283,-0.04187,-0.004545,-0.009594,-0.0009259,0.01373,-0.01498,0.003725,0.04418,0.003145,0.004539,0.007429,-0.02189,0.01177,0.01202,4.536e-05,-0.00232,-0.008496,-0.005744,-0.001934,0.005263,0.0003209,0.002933,-0.02173,-0.01352,-0.009582
ShotType_3,0.01117,-0.2478,-0.0951,1.0,-0.06837,-0.1011,-0.02956,-0.02343,-0.002543,-0.008699,-0.001109,0.003341,-0.01233,0.06518,-0.1412,-0.005444,-0.00756,-0.005378,-0.006365,0.0283,-0.01353,9.988e-06,0.001271,-0.01061,-0.006095,-0.000973,-0.0007128,-0.0105,-0.01171,-0.006166,-0.00115,-0.03969
ShotType_4,-0.001618,-0.3184,-0.1222,-0.06837,1.0,-0.1299,-0.03798,-0.0301,-0.003267,0.01082,-1.702e-06,0.0003682,0.002694,0.02074,-0.154,-0.02455,-0.003567,-0.002717,0.03002,-0.01349,0.007559,-0.008118,-0.01584,-0.001522,0.005883,0.0001925,-0.0002819,0.01627,0.04265,0.02904,0.0154,0.02556
ShotType_5,-0.03787,-0.471,-0.1807,-0.1011,-0.1299,1.0,-0.05618,-0.04452,-0.004833,-0.001459,-0.00115,0.01856,-0.02485,-0.04746,0.203,0.002763,-0.00909,-0.001935,-0.01211,0.001185,-0.01645,0.01176,0.0478,0.003824,-0.0104,-0.0008519,0.004468,-0.006985,-0.05625,-0.01125,0.002389,-0.01628
ShotType_6,0.02262,-0.1377,-0.05283,-0.02956,-0.03798,-0.05618,1.0,-0.01301,-0.001413,-0.008931,0.002798,0.001205,-0.008171,0.03486,-0.06679,0.0005272,-0.007068,-0.005946,-0.005791,0.01956,-0.004336,-0.002491,-0.002297,-0.003777,0.002378,0.000432,-0.002307,-0.006409,-0.004016,-0.005628,-0.0053,-0.02047
ShotType_7,-0.00512,-0.1091,-0.04187,-0.02343,-0.0301,-0.04452,-0.01301,1.0,-0.001119,0.001654,-0.004072,0.003991,-0.004686,-0.008872,-0.08571,-0.007307,0.007579,-0.002868,0.003904,-0.008623,0.001198,0.007087,-0.01088,0.002512,-0.00264,0.001016,-6.255e-05,-0.003552,-0.0009872,0.003864,-0.0001166,-0.002799
ShotType_8,9.163e-05,-0.01184,-0.004545,-0.002543,-0.003267,-0.004833,-0.001413,-0.001119,1.0,0.008225,-0.0005897,-0.002692,0.03711,-0.003542,0.0004207,0.0003369,-0.003382,-0.0005317,-0.003147,0.008941,-0.002648,-0.003258,-0.002062,-0.0009812,0.00377,0.0002013,0.001434,-0.002482,0.003606,-0.003742,-0.002427,-0.001433
Period,-0.005912,0.007507,-0.009594,-0.008699,0.01082,-0.001459,-0.008931,0.001654,0.008225,1.0,0.0007331,0.001902,0.07627,0.04711,-0.00422,0.002088,-0.04844,-0.02112,0.01989,0.01652,-0.002585,0.02226,-0.009937,-0.0007266,0.03279,0.005119,0.001095,0.1241,0.01754,0.01101,0.002397,-0.04395
