# Predicting NHL Playoff Games from Event Data


In [50]:
# Manejo de dataframes
import pandas as pd

# Manejo de datos en forma de strings
import hashlib

# FeatureTools
import featuretools as ft
from featuretools import Feature 

# Aprendizaje máquina
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [51]:
# Carga el dataframe de games

game_df = pd.read_csv("game.csv.zip")
game_df.head()

Unnamed: 0,game_id,season,type,date_time,date_time_GMT,away_team_id,home_team_id,away_goals,home_goals,outcome,home_rink_side_start,venue,venue_link,venue_time_zone_id,venue_time_zone_offset,venue_time_zone_tz
0,2011030221,20112012,P,2012-04-29,2012-04-29T19:00:00Z,1,4,3,4,home win OT,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
1,2011030222,20112012,P,2012-05-01,2012-05-01T23:30:00Z,1,4,4,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT
2,2011030223,20112012,P,2012-05-03,2012-05-03T23:30:00Z,4,1,3,4,home win OT,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
3,2011030224,20112012,P,2012-05-06,2012-05-06T23:30:00Z,4,1,2,4,home win REG,left,Prudential Center,/api/v1/venues/null,America/New_York,-4,EDT
4,2011030225,20112012,P,2012-05-08,2012-05-08T23:30:00Z,1,4,3,1,away win REG,right,Wells Fargo Center,/api/v1/venues/null,America/New_York,-4,EDT


In [52]:
# Carga el dataframe de game_plays

plays_df = pd.read_csv("game_plays.csv.zip")


# select a ~10% sample of the games
plays_df = plays_df \
    .sample(frac=0.1, random_state=1) \
    .drop(
        ['secondaryType', 'periodType', 'dateTime', 'rink_side'], 
        axis=1
    ) \
    .fillna(0)


# convert the remaining strings to integer types via hashing
plays_df['event'] = plays_df['event'].apply(hash)
plays_df['description'] = plays_df['description'].apply(hash)

plays_df.head()

Unnamed: 0,play_id,game_id,play_num,team_id_for,team_id_against,event,x,y,period,periodTime,periodTimeRemaining,goals_away,goals_home,description,st_x,st_y
3323025,2011020033_140,2011020033,140,0.0,0.0,-3352894116208005710,0.0,0.0,2,86,1114,2,0,4303636255104359196,0.0,0.0
1528803,2015020106_258,2015020106,258,24.0,30.0,-2587625740014596625,85.0,22.0,3,317,883,0,3,-792122103302791759,85.0,22.0
2685990,2018020787_305,2018020787,305,54.0,12.0,-2803048138073456343,-87.0,-39.0,3,804,396,2,4,-4174251371639034632,-87.0,-39.0
796099,2013020091_191,2013020091,191,52.0,8.0,-2587625740014596625,87.0,-6.0,2,781,419,2,0,-2956094896739369059,87.0,-6.0
726718,2013020931_239,2013020931,239,23.0,27.0,-2803048138073456343,59.0,-39.0,3,440,760,0,1,2046730178332783446,-59.0,39.0


In [53]:
# create feature encodings for the event and description fields
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(
    entity_id="plays", 
    dataframe=plays_df, 
    index="play_id",
    variable_types = { 
        "event": ft.variable_types.Categorical, 
        "description": ft.variable_types.Categorical 
    }
)       
         
f1 = Feature(es["plays"]["event"])
f2 = Feature(es["plays"]["description"])

encoded, _= ft.encode_features(plays_df, [f1, f2], top_n=10)
encoded.reset_index(inplace=True)

# create an entry set of the encoded play data and games
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(
    entity_id="plays", 
    dataframe=encoded, 
    index="play_id"
)
es = es.normalize_entity(
    base_entity_id="plays",
    new_entity_id="games", 
    index="game_id"
)

In [54]:
es

Entityset: plays
  Entities:
    plays [Rows: 363235, Columns: 37]
    games [Rows: 11244, Columns: 1]
  Relationships:
    plays.game_id -> games.game_id

In [55]:
es['games']

Entity: games
  Variables:
    game_id (dtype: index)
  Shape:
    (Rows: 11244, Columns: 1)

In [56]:
features, defs = ft.dfs(
    entityset=es, 
    target_entity="games", 
    max_depth = 2
)
features.reset_index(inplace=True)
features.shape

(11244, 102)

In [57]:
# assign labels to the generated features
train_data = features.merge(game_df[['game_id', 'type']], on="game_id")
train_data.groupby('type').type.count()

type
P      784
R    10460
Name: type, dtype: int64

In [58]:
train_data

Unnamed: 0,game_id,COUNT(plays),MAX(plays.goals_away),MAX(plays.goals_home),MAX(plays.index),MAX(plays.period),MAX(plays.periodTime),MAX(plays.periodTimeRemaining),MAX(plays.play_num),MAX(plays.st_x),...,SUM(plays.periodTime),SUM(plays.periodTimeRemaining),SUM(plays.play_num),SUM(plays.st_x),SUM(plays.st_y),SUM(plays.team_id_against),SUM(plays.team_id_for),SUM(plays.x),SUM(plays.y),type
0,2011020033,37,3,3,3323274,4,1197,1182,389,99.0,...,21607,21893,6570,638.0,199.0,645.0,603.0,-236.0,27.0,R
1,2015020106,37,0,3,1528842,3,1200,1200,297,98.0,...,20845,23555,5768,326.0,-52.0,690.0,660.0,264.0,0.0,R
2,2018020787,30,2,5,2686016,3,1200,1184,331,99.0,...,20898,15102,5118,-59.0,-125.0,516.0,936.0,283.0,-27.0,R
3,2013020091,45,3,0,796241,3,1200,1200,333,99.0,...,25286,28714,6488,103.0,54.0,1088.0,1132.0,365.0,-20.0,R
4,2013020931,35,0,1,726784,3,1159,1193,305,96.0,...,20561,21439,5980,754.0,-92.0,731.0,719.0,-296.0,36.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11239,2010020831,27,0,3,3254463,3,1200,1200,325,98.0,...,16408,15992,3883,93.0,278.0,196.0,182.0,-145.0,20.0,R
11240,2017020425,22,2,5,2048207,3,1200,1152,290,96.0,...,13743,12657,3312,98.0,129.0,546.0,518.0,-146.0,-45.0,R
11241,2011020830,18,5,3,3238054,3,1061,1200,249,81.0,...,7581,14019,2091,-32.0,47.0,299.0,284.0,-150.0,11.0,R
11242,2011020346,27,3,4,3258738,3,1191,1184,309,93.0,...,16646,15754,5220,871.0,0.0,95.0,89.0,-59.0,48.0,R


In [60]:
y = train_data['type']
X = train_data.drop(['type', 'game_id'], axis=1).fillna(0)

In [63]:
lr = LogisticRegression()

model = lr.fit(X, y)
model.score(X, y)

0.9304517965136961

In [64]:

roc_auc_score(y, model.predict_proba(X)[:, 1] )
    

0.9067341085573809