<center>
<p><img src="https://mcd.unison.mx/wp-content/themes/awaken/img/logo_mcd.png" width="150">
</p>



<h1>Curso Ingeniería de Características</h1>

<h3>Predicting NHL Playoff Games from Event Data</h3>


<p> Julio Waissman Vilanova </p>
<p>
<img src="https://identidadbuho.unison.mx/wp-content/uploads/2019/06/letragrama-cmyk-72.jpg" width="150">
</p>


<a target="_blank" href="https://colab.research.google.com/github/mcd-unison/ing-caract/blob/main/ejemplos/featuretools/NHL_Games.ipynb"><img src="https://i.ibb.co/2P3SLwK/colab.png"  style="padding-bottom:5px;"  width="30" /> Ejecuta en Colab</a>

</center>


**Ejemplo tomado de la librería [Featuretools](https://www.featuretools.com)**

In [None]:
#!conda install featuretools=0.24.0  -c conda-forge -y 
!pip install featuretools=0.24.0

In [None]:
# Manejo de datos en forma de strings
import hashlib

# Manejo de dataframes
import numpy as np
import pandas as pd

# FeatureTools
import featuretools as ft
from featuretools import Feature

# Aprendizaje máquina
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [None]:
# Load the games table from the course repository. The URL points at a zipped
# CSV, which pandas reads directly.
url = "https://github.com/mcd-unison/ing-caract/raw/main/ejemplos/featuretools/game.csv.zip"
game_df = pd.read_csv(url)

# Preview the first five rows.
game_df.head(n=5)

In [None]:
# Load the play-by-play events table (zipped CSV) from the course repository.
plays_url = "https://github.com/mcd-unison/ing-caract/raw/main/ejemplos/featuretools/game_plays.csv.zip"
plays_df = pd.read_csv(plays_url)

# Show the frame as the cell output.
plays_df

In [None]:
# Draw one uniform and one standard-normal sample from NumPy's global random
# generator. Uncomment the seed line to make both draws repeatable.
# NumPy is accessed directly: the `pd.np` alias was deprecated in pandas 1.0
# and removed in pandas 2.0.
#np.random.seed(3)
print(np.random.random())
print(np.random.randn())

In [None]:
# Keep a ~10% random sample of the plays (rows), drop the columns that are not
# used as features, and replace every missing value with 0.
# NOTE: this cell overwrites `plays_df`; re-running it without reloading the
# CSV samples 10% of the already-sampled frame.
plays_df = (
    plays_df
    .sample(frac=0.1, random_state=1)
    .drop(columns=['secondaryType', 'periodType', 'dateTime', 'rink_side'])
    .fillna(0)
)


def stable_hash(value):
    """Return a signed 64-bit integer code for ``value`` that is the same on every run.

    The built-in ``hash`` is salted per interpreter process for strings, so
    its output changes after each kernel restart and the encoded categories
    would not be reproducible. A cryptographic digest of the text form of the
    value has no such salt.

    Args:
        value: Any object; it is converted with ``str`` before hashing, so the
            0 inserted by ``fillna`` above is handled like any other entry.

    Returns:
        int: The first 8 bytes of the SHA-256 digest, read as a signed
        big-endian integer (fits in an int64 column).
    """
    digest = hashlib.sha256(str(value).encode("utf-8")).digest()
    return int.from_bytes(digest[:8], byteorder="big", signed=True)


# Convert the remaining string columns to integer codes via hashing.
plays_df['event'] = plays_df['event'].apply(stable_hash)
plays_df['description'] = plays_df['description'].apply(stable_hash)

plays_df.head()

In [None]:
# Step 1: temporary entity set whose only job is to declare `event` and
# `description` as categorical, so they can be encoded.
categorical = ft.variable_types.Categorical
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(
    entity_id="plays",
    dataframe=plays_df,
    index="play_id",
    variable_types={"event": categorical, "description": categorical},
)

# Identity features over the two categorical columns.
f1, f2 = (Feature(es["plays"][column]) for column in ("event", "description"))

# Step 2: encode both features, keeping the 10 most frequent values of each,
# then move the frame's index back into a regular column.
encoded, _ = ft.encode_features(plays_df, [f1, f2], top_n=10)
encoded = encoded.reset_index()

# Step 3: rebuild the entity set from the encoded plays and derive a `games`
# entity keyed by `game_id` from it.
es = ft.EntitySet(id="plays")
es = es.entity_from_dataframe(entity_id="plays", dataframe=encoded, index="play_id")
es = es.normalize_entity(base_entity_id="plays", new_entity_id="games", index="game_id")

In [None]:
# List the column labels of the encoded plays frame.
encoded.columns

In [None]:
es.add_dataframe?

In [None]:
# Display the `games` entity that normalize_entity derived from the plays.
es['games']

In [None]:
# Run Deep Feature Synthesis with `games` as the target entity, stacking
# primitives up to depth 2. Returns the feature matrix and the list of
# feature definitions.
features, defs = ft.dfs(entityset=es, target_entity="games", max_depth=2)

# Turn the index into a regular column, then report (rows, columns).
features = features.reset_index()
features.shape

In [None]:
# Display the feature matrix produced by ft.dfs for the games entity.
features

In [None]:
# Ask Featuretools to describe the feature definition at position 2 of `defs`.
ft.describe_feature(defs[2])

In [None]:
# Attach the label: bring the `type` column of each game into the feature
# matrix by joining on `game_id`.
game_labels = game_df[['game_id', 'type']]
train_data = features.merge(game_labels, on="game_id")

# Number of games per `type` value.
train_data.groupby('type')['type'].count()

In [None]:
# Display the feature matrix after the game `type` label has been merged in.
train_data

In [None]:
# Predictors: every column except the label and the game identifier.
X = train_data.drop(columns=['type', 'game_id'])
# Target: the game `type` label.
y = train_data['type']

In [None]:
# Fit a logistic regression with default settings on the whole feature matrix.
# `fit` returns the estimator itself, so `model` and `lr` are the same object.
lr = LogisticRegression()
lr.fit(X, y)
model = lr

# Mean accuracy on the same rows used for fitting (there is no hold-out set,
# so this is a training score).
model.score(X, y)

In [None]:
# Area under the ROC curve on the training rows, scored with the second
# column of the predicted class probabilities.
train_scores = model.predict_proba(X)[:, 1]
roc_auc_score(y, train_scores)