# Automated feature engineering with [Deep Feature Synthesis](https://docs.featuretools.com/)

Learning resources:
- https://github.com/WillKoehrsen/automated-feature-engineering/blob/master/walk_through/Automated_Feature_Engineering.ipynb
- https://towardsdatascience.com/automated-feature-engineering-in-python-99baf11cc219

In [1]:
import numpy as np
import pandas as pd
import featuretools as ft
import time

In [2]:
# utils
class Timer:
    """Context manager that prints how long its ``with`` body took, in minutes.

    Usage::

        with Timer('training') as t:
            ...
        t.elapsed  # duration of the body in seconds

    Attributes:
        descr: Label printed in front of the measured duration.
        ts: Wall-clock time (``time.time()``) at which the body was entered,
            or ``None`` before the first use.
        elapsed: Duration of the most recent body in seconds, or ``None``
            before the first use.
    """

    def __init__(self, descr='Execution time'):
        self.ts = None
        self.elapsed = None
        self.descr = descr
        self._start = None

    def __enter__(self):
        self.ts = time.time()
        # Durations are taken from a monotonic clock so that a system clock
        # adjustment during the body cannot yield a negative or skewed result.
        self._start = time.perf_counter()
        # Return the instance so ``with Timer() as t`` gives access to ``t.elapsed``.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.elapsed = time.perf_counter() - self._start
        timing = '%r  %2.2f minutes' % (self.descr, self.elapsed / 60)
        print(timing)
        # Never suppress an exception raised inside the body.
        return False

### Define database

In [3]:
# Toy relational "database" used throughout the notebook.
# players 1-to-many transfers, players 1-to-many all_games, all_games 1-to-many game_details.
# NOTE: the random columns are not seeded, so values differ between runs.

# One row per player.
players = pd.DataFrame({'player_id': [0, 1, 2, 3],
                        'name': ['John', 'Rose', 'Amber', 'Tom']})

# 15 random transfers: a player, a year in 2000..2016 and a contract value
# drawn from 200_000..999_000 in steps of 1_000.
transfers = pd.DataFrame({'transfer_id': np.arange(15),
                          'player_id': np.random.choice([0, 1, 2, 3], size=15),
                          'year': np.random.choice(np.arange(2000, 2017), size=15),
                          'contract_value ($)': np.random.choice(np.arange(2e5, 1e6, 1e3), size=15)})

# Eight fixed games, each recorded for one player against one opponent.
all_games = pd.DataFrame({'game_id': [0, 1, 2, 3, 4, 5, 6, 7],
                          'player_id': [0, 3, 3, 1, 2, 2, 1, 2],
                          'opponent': [2, 1, 0, 2, 1, 3, 0, 1],
                          'scored_points': [10, 11, 8, 3, 5, 11, 0, 12],
                          'lost_points': [3, 8, 9, 15, 7, 15, 8, 13]
                         })

# Total number of points over all games; used as the number of detail rows.
all_points = int(all_games['scored_points'].sum() + all_games['lost_points'].sum())

# Timestamps are "now" plus a random offset, all expressed in integer nanoseconds.
# dtype=np.int64 is explicit because the default integer type of randint can be
# 32-bit on some platforms, where an upper bound of 10**16 is out of range.
# Staying in integers also avoids rounding nanosecond values through float64.
now_ns = int(time.time() * 1e9)
offsets_ns = np.random.randint(low=10**10, high=10**16, size=all_points, dtype=np.int64)

# One row per point event, randomly attached to a game.
game_details = pd.DataFrame({'detail_id': np.arange(all_points),
                             'game_id': np.random.choice(all_games['game_id'], all_points),
                             'score': np.random.choice([-1, 2, 3, 4], all_points),
                             'timestamp': offsets_ns + now_ns})
# Integer input to to_datetime is interpreted as nanoseconds since the Unix epoch.
game_details['timestamp'] = pd.to_datetime(game_details['timestamp'])

In [4]:
players

Unnamed: 0,player_id,name
0,0,John
1,1,Rose
2,2,Amber
3,3,Tom


In [5]:
transfers

Unnamed: 0,transfer_id,player_id,year,contract_value ($)
0,0,0,2007,282000.0
1,1,0,2012,842000.0
2,2,0,2014,435000.0
3,3,1,2013,910000.0
4,4,0,2014,398000.0
5,5,3,2013,863000.0
6,6,2,2011,691000.0
7,7,0,2006,705000.0
8,8,2,2006,897000.0
9,9,2,2000,953000.0


In [6]:
all_games

Unnamed: 0,game_id,player_id,opponent,scored_points,lost_points
0,0,0,2,10,3
1,1,3,1,11,8
2,2,3,0,8,9
3,3,1,2,3,15
4,4,2,1,5,7
5,5,2,3,11,15
6,6,1,0,0,8
7,7,2,1,12,13


In [7]:
game_details.head()

Unnamed: 0,detail_id,game_id,score,timestamp
0,0,4,2,2018-09-26 17:40:31.295806976
1,1,5,4,2018-10-01 16:38:42.781442048
2,2,2,3,2018-09-11 02:17:19.870081280
3,3,7,-1,2018-11-24 21:04:33.298283776
4,4,2,4,2018-12-15 23:33:55.886451200


## Define new data structure

#### An EntitySet is a collection of tables (entities) and the relationships between them

In [8]:
# Start with an empty EntitySet whose id is 'database'.
es = ft.EntitySet(id='database')

In [9]:
es

Entityset: database
  Entities:
  Relationships:
    No relationships

##### Create new entities (tables) from dataframes and add them to collection (EntitySet)

In [10]:
# Register the three plain tables as entities; each tuple is
# (entity id, source dataframe, name of the column used as the index).
entity_specs = [
    ('players', players, 'player_id'),
    ('transfers', transfers, 'transfer_id'),
    ('all_games', all_games, 'game_id'),
]
for entity_id, dataframe, index in entity_specs:
    es.entity_from_dataframe(entity_id=entity_id, dataframe=dataframe, index=index)

# Keep the EntitySet as the last expression so the cell still shows its summary.
es

Entityset: database
  Entities:
    players [Rows: 4, Columns: 2]
    transfers [Rows: 15, Columns: 4]
    all_games [Rows: 8, Columns: 5]
  Relationships:
    No relationships

__ft__ infers variable types automatically. However, identifying the correct variable types is important, because Featuretools applies different operations to different data types (just as we do in manual feature engineering).

In [11]:
# Override the inferred type of 'timestamp' so it is treated as a datetime time index.
# NOTE(review): entity_from_dataframe may also offer a dedicated time-index argument;
# confirm whether that should be used instead of a variable-type override.
game_details_variable_types = {
    'timestamp': ft.variable_types.DatetimeTimeIndex,
}

es.entity_from_dataframe(
    entity_id='game_details',
    dataframe=game_details,
    index='detail_id',
    variable_types=game_details_variable_types,
)

Entityset: database
  Entities:
    players [Rows: 4, Columns: 2]
    transfers [Rows: 15, Columns: 4]
    all_games [Rows: 8, Columns: 5]
    game_details [Rows: 138, Columns: 4]
  Relationships:
    No relationships

In [12]:
es['game_details'].variables

[<Variable: detail_id (dtype = index)>,
 <Variable: game_id (dtype = numeric)>,
 <Variable: score (dtype = numeric)>,
 <Variable: timestamp (dtype: datetime_time_index, format: None)>]

##### Define & apply relations between tables (entities)
``` __init__(parent_variable, child_variable) ```

In [13]:
# Column shared by the 'players' and 'transfers' entities.
linking_id = 'player_id'

# The parent side is the 'players' entity and the child side is 'transfers';
# both are addressed through the same column name.
parent_variable=es['players'][linking_id]
child_variable=es['transfers'][linking_id]

# Relationship takes the parent variable first, then the child variable.
players_transfers_relationship = ft.Relationship(parent_variable, child_variable)

In [14]:
# Remaining parent -> child links, written as (parent entity, child entity, shared key column).
extra_links = [
    ('players', 'all_games', 'player_id'),
    ('all_games', 'game_details', 'game_id'),
]

# The players/transfers relationship comes first, followed by the links above.
relationships = [players_transfers_relationship] + [
    ft.Relationship(es[parent][key], es[child][key])
    for parent, child, key in extra_links
]
es.add_relationships(relationships)

Entityset: database
  Entities:
    players [Rows: 4, Columns: 2]
    transfers [Rows: 15, Columns: 4]
    all_games [Rows: 8, Columns: 5]
    game_details [Rows: 138, Columns: 4]
  Relationships:
    transfers.player_id -> players.player_id
    all_games.player_id -> players.player_id
    game_details.game_id -> all_games.game_id

## Generate new features

- Featuretools will automatically create many features from a set of related tables, using two kinds of operations: transformations and aggregations.
- Deep feature synthesis (dfs) stacks multiple transformation and aggregation operations across tables. Its depth is set by the `max_depth` parameter.

Operations in featuretools ecosystem are called [feature-primitives](https://docs.featuretools.com/automated_feature_engineering/primitives.html)
- Transformation - acts on a single table by creating new features out of one or more of the existing columns. 
- Aggregation - uses a one-to-many relationship to group observations and then calculates statistics for each group.


In [15]:
# Aggregation primitives applied across one-to-many relationships.
# Labelled "defaults" by the author - TODO confirm against the installed featuretools version.
agg_primitives = [
    "sum",
    "std",
    "max",
    "skew",
    "min",
    "mean",
    "count",
    "percent_true",
    "mode",
]

# Transformation primitives applied within a single table.
trans_primitives = [
    'time_since_previous',
    "year",
    'cum_count',
    'diff',
    'and',
    'or',
]

__Note:__ When dfs is called with `features_only=True`, only the feature definitions are returned. By default this parameter is set to False. It is useful for quickly inspecting the feature definitions before spending time on calculating the feature matrix.

In [16]:
# Dry run: with features_only=True only the feature definitions are produced,
# so the cost of computing the feature matrix is skipped.
with Timer():
    feature_defs = ft.dfs(
        entityset=es,
        target_entity='players',
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        max_depth=4,
        features_only=True,
    )

'Execution time'  0.20 minutes


In [17]:
# Full run: with features_only=False dfs yields both the computed feature matrix
# and the feature definitions, for the 'players' entity.
with Timer():
    feature_matrix, feature_defs = ft.dfs(
        entityset=es,
        target_entity='players',
        agg_primitives=agg_primitives,
        trans_primitives=trans_primitives,
        max_depth=4,
        features_only=False,
    )

'Execution time'  0.24 minutes


#### The returned value is a DataFrame with the new features.
Not all of them will be crucial for the model, or even make sense; however, with this tool we can leverage automation.

In [18]:
feature_matrix

Unnamed: 0_level_0,name,SUM(transfers.year),SUM(transfers.contract_value ($)),STD(transfers.year),STD(transfers.contract_value ($)),MAX(transfers.year),MAX(transfers.contract_value ($)),SKEW(transfers.year),SKEW(transfers.contract_value ($)),MIN(transfers.year),...,MEAN(all_games.DIFF(STD(game_details.DIFF(score by game_id)) by player_id)),MEAN(all_games.DIFF(MEAN(game_details.CUM_COUNT(detail_id by game_id)) by player_id)),MEAN(all_games.DIFF(MEAN(game_details.DIFF(score by game_id)) by player_id)),MEAN(all_games.DIFF(SUM(game_details.time_since_previous_by_game_id) by player_id)),MEAN(all_games.DIFF(MIN(game_details.DIFF(score by game_id)) by player_id)),MEAN(all_games.DIFF(MEAN(game_details.time_since_previous_by_game_id) by player_id)),MEAN(all_games.DIFF(STD(game_details.CUM_COUNT(detail_id by game_id)) by player_id)),MEAN(all_games.DIFF(MAX(game_details.DIFF(score by game_id)) by player_id)),MEAN(all_games.DIFF(SKEW(game_details.time_since_previous_by_game_id) by player_id)),MEAN(all_games.DIFF(SKEW(game_details.DIFF(score by game_id)) by player_id))
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,John,12054,3546000.0,4.760952,230547.175216,2014,884000.0,-0.416995,0.02665,2001,...,,,,,,,,,,
1,Rose,4026,1495000.0,0.0,162500.0,2013,910000.0,0.0,0.0,2013,...,0.191535,0.5,0.0,-918401.9,0.0,-24714.153038,0.289056,1.0,,
2,Amber,6017,2541000.0,4.496913,112652.859114,2011,953000.0,-0.11078,-0.578327,2000,...,0.317837,-0.5,0.125,-73391.27,0.5,-15435.513384,-0.28969,0.5,,
3,Tom,8030,2400000.0,4.769696,211446.210654,2013,863000.0,-0.186618,-0.038367,2001,...,0.619988,4.0,0.025974,8649261.0,-1.0,410506.14276,2.312756,1.0,,
