# Predicting NHL Playoff Games from Event Data


In [None]:
!conda install featuretools -c conda-forge -y

In [None]:
# String data handling
import hashlib

# Dataframe handling
import pandas as pd

# FeatureTools
import featuretools as ft
from featuretools import Feature

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Read the games table from its zipped CSV and preview the first rows.
game_df = pd.read_csv("game.csv.zip", compression="zip")
game_df.head(n=5)

In [None]:
# Read the game_plays table from its zipped CSV and display it.
plays_df = pd.read_csv("game_plays.csv.zip", compression="zip")
plays_df

In [None]:
def _stable_hash(value):
    """Map *value* to a signed 64-bit integer that is the same in every session.

    The built-in ``hash`` is salted per process for strings, so its output
    changes whenever the kernel restarts. A fixed digest keeps the encoded
    categories reproducible, matching the fixed ``random_state`` used for
    sampling below.
    """
    digest = hashlib.blake2b(str(value).encode("utf-8"), digest_size=8).digest()
    # signed=True keeps the result inside the int64 range pandas uses.
    return int.from_bytes(digest, "big", signed=True)


# Keep a reproducible ~10% sample of the play rows (individual plays are
# sampled, not whole games), drop columns that are not used downstream,
# and replace missing values with 0.
plays_df = (
    plays_df
    .sample(frac=0.1, random_state=1)
    .drop(columns=["secondaryType", "periodType", "dateTime", "rink_side"])
    .fillna(0)
)

# Convert the remaining string columns to integers via a deterministic hash.
plays_df["event"] = plays_df["event"].apply(_stable_hash)
plays_df["description"] = plays_df["description"].apply(_stable_hash)

plays_df.head()

In [None]:
# Step 1: wrap the hashed `event` and `description` columns as categorical
# featuretools features. This first entity set exists only to obtain the
# two feature objects passed to encode_features.
es = ft.EntitySet(id="plays").entity_from_dataframe(
    entity_id="plays",
    dataframe=plays_df,
    index="play_id",
    variable_types={
        "event": ft.variable_types.Categorical,
        "description": ft.variable_types.Categorical,
    },
)

f1 = Feature(es["plays"]["event"])
f2 = Feature(es["plays"]["description"])

# Step 2: encode both columns, limited to their 10 most frequent values.
encoded, _ = ft.encode_features(plays_df, [f1, f2], top_n=10)
# NOTE(review): reset_index() keeps the previous row index as an extra
# column, so it becomes part of the data handed to the entity set below —
# confirm that is intended.
encoded = encoded.reset_index()

# Step 3: rebuild the entity set from the encoded plays and derive a
# `games` entity keyed by game_id.
es = (
    ft.EntitySet(id="plays")
    .entity_from_dataframe(
        entity_id="plays",
        dataframe=encoded,
        index="play_id",
    )
    .normalize_entity(
        base_entity_id="plays",
        new_entity_id="games",
        index="game_id",
    )
)

In [None]:
# Display the entity set to inspect what the previous cell built.
es

In [None]:
# Display the `games` entity that normalize_entity derived from the plays.
es['games']

In [None]:
# Run deep feature synthesis with `games` as the target entity; `defs`
# holds the feature definitions matching the columns of `features`.
features, defs = ft.dfs(entityset=es, target_entity="games", max_depth=2)

# Move the index into a regular column so the later merge on game_id
# can use it, then report (rows, columns).
features = features.reset_index()
features.shape

In [None]:
# Show the description featuretools generates for one DFS feature definition.
# NOTE(review): 51 is a hard-coded position in `defs`; which feature it picks
# depends on the order dfs returned, and it raises IndexError if fewer than
# 52 features were generated — confirm it still selects the intended feature.
ft.describe_feature(defs[51])

In [None]:
# Join each generated feature row with its game's `type` label from game_df.
train_data = pd.merge(features, game_df[["game_id", "type"]], on="game_id")

# Number of rows per label value.
train_data.groupby("type")["type"].count()

In [None]:
# Display the merged feature/label table.
train_data

In [None]:
# Feature matrix: every column except the label (`type`) and the
# identifier (`game_id`). Target: the `type` column.
X = train_data.drop(columns=["type", "game_id"])
y = train_data["type"]

In [None]:
lr = LogisticRegression()

model = lr.fit(X, y)
model.score(X, y)

In [None]:
roc_auc_score(y, model.predict_proba(X)[:, 1] )
    