# Feature Engineering

In [11]:
import pandas as pd

game_df = pd.read_csv("game.csv")
plays_df = pd.read_csv("game_plays.csv")

plays_df = plays_df.drop(['secondaryType', 'periodType', 
                 'dateTime', 'rink_side'], axis=1).fillna(0)


In [12]:
import featuretools as ft
from featuretools import Feature 

es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=plays_df, index="play_id",
           variable_types = { "event": ft.variable_types.Categorical, 
                             "description": ft.variable_types.Categorical })       

f1 = Feature(es["plays"]["event"])
f2 = Feature(es["plays"]["description"])

encoded, defs = ft.encode_features(plays_df, [f1, f2], top_n=10)
encoded.reset_index(inplace=True)
encoded.head()

Unnamed: 0,index,event = Faceoff,event = Shot,event = Hit,event = Stoppage,event = Blocked Shot,event = Missed Shot,event = Giveaway,event = Takeaway,event = Penalty,...,team_id_against,x,y,period,periodTime,periodTimeRemaining,goals_away,goals_home,st_x,st_y
0,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
2,2,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
3,3,1,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,1,0,1200,0,0,0.0,0.0
4,4,0,0,0,0,0,0,1,0,0,...,1.0,28.0,24.0,1,21,1179,0,0,-28.0,-24.0


In [13]:
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=encoded, index="play_id")
es = es.normalize_entity(base_entity_id="plays",new_entity_id="games", index="game_id")

features,transform=ft.dfs(entityset=es,target_entity="games",max_depth=2)
features.reset_index(inplace=True)
features.head()


Unnamed: 0,game_id,SUM(plays.event = Blocked Shot),SUM(plays.description = Puck Frozen),SUM(plays.event = Penalty),SUM(plays.description = Icing),SUM(plays.event = Faceoff),SUM(plays.periodTimeRemaining),SUM(plays.event = Giveaway),SUM(plays.description = Period Ready),SUM(plays.st_y),...,MEAN(plays.goals_home),MEAN(plays.description = Period Start),MEAN(plays.team_id_for),MEAN(plays.description is unknown),MEAN(plays.periodTime),MEAN(plays.x),MEAN(plays.event = Hit),MEAN(plays.event = Stoppage),MEAN(plays.event = Takeaway),COUNT(plays)
0,2011030221,30,1,9,11,69,229547,19,4,-313.0,...,1.573446,0.011299,2.090395,0.838983,551.562147,2.293785,0.163842,0.146893,0.031073,354
1,2011030111,32,1,11,14,67,226674,11,3,462.0,...,1.869806,0.00831,4.828255,0.836565,572.094183,1.049861,0.204986,0.149584,0.019391,361
2,2011030222,35,3,14,10,53,191462,17,3,394.0,...,0.923077,0.009615,1.996795,0.86859,586.339744,-6.862179,0.179487,0.121795,0.041667,312
3,2011030223,27,2,7,5,65,202598,13,4,-86.0,...,1.959375,0.0125,1.86875,0.821875,566.88125,11.0625,0.18125,0.153125,0.015625,320
4,2011030224,16,0,7,8,64,176605,25,3,-605.0,...,1.811075,0.009772,2.061889,0.81759,624.739414,1.319218,0.136808,0.162866,0.026059,307


In [None]:
import framequery as fq

# assign labels to the generated features
features = fq.execute("""
  SELECT f.*, case when g.type = 'P' then 1 else 0 end as label
  FROM features f 
  JOIN game_df g
    on f.game_id = g.game_id
""")

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# create inputs for sklearn
y = features['label']
X = features.drop(['label', 'game_id'], axis=1).fillna(0)

# train a classifier 
lr = LogisticRegression()
model = lr.fit(X, y)

# Results
print("Accuracy: " + str(model.score(X, y)))
print("ROC: " + str(roc_auc_score(y, model.predict_proba(X)[:, 1] )))



Accuracy: 0.9472607612949129
ROC: 0.9233913450657509
