## Defining Entities and Establishing Relationships

In [15]:
import pandas as pd
import numpy as np
import featuretools as ft

In [16]:
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/bank-full.csv'
bankData = pd.read_csv(file_url,sep=";")
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [17]:
y = bankData.pop('y')

In [18]:
#IDs para a entidade demográfica
bankData['custID'] = bankData.index.values
bankData['custID'] = 'cust' + bankData['custID'].astype(str)

In [19]:
bankData['AssertID'] = 0
bankData.loc[bankData.housing == 'yes', 'AssetID'] = 1

In [20]:
bankData['LoanID'] = 0
bankData.loc[bankData.loan == 'yes', 'LoadID'] = 1

In [21]:
bankData['FinbehID'] = 0
bankData.loc[bankData.default == 'yes', ['FinbehID']] = 1

In [22]:
bankData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,campaign,pdays,previous,poutcome,custID,AssertID,AssetID,LoanID,LoadID,FinbehID
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,1,-1,0,unknown,cust0,0,1.0,0,,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,1,-1,0,unknown,cust1,0,1.0,0,,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,1,-1,0,unknown,cust2,0,1.0,0,1.0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,1,-1,0,unknown,cust3,0,1.0,0,,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,1,-1,0,unknown,cust4,0,,0,,0


In [23]:
# Criando a entidade
bankEntities = ft.EntitySet(id = 'Bank')

In [24]:
bankEntities.entity_from_dataframe(entity_id='Demographic Data', dataframe=bankData, index='custID')

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 22]
  Relationships:
    No relationships

In [25]:
bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='Assets', index='AssetID', additional_variables=['housing'])

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 21]
    Assets [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID

In [26]:
bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='Liability', index='LoanID', additional_variables=['loan'])

bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='FinBehaviour', index='FinbehID', additional_variables=['default'])


Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 19]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 1, Columns: 2]
    FinBehaviour [Rows: 2, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID
    Demographic Data.LoanID -> Liability.LoanID
    Demographic Data.FinbehID -> FinBehaviour.FinbehID

## Creating New Features Using Deep Feature Synthesis

In [27]:
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter17/Datasets/bank-full.csv'
bankData = pd.read_csv(file_url,sep=";")

y = bankData.pop('y')

bankData['custID'] = bankData.index.values
bankData['custID'] = 'cust' + bankData['custID'].astype(str)

bankData['AssertID'] = 0
bankData.loc[bankData.housing == 'yes', 'AssetID'] = 1

bankData['LoanID'] = 0
bankData.loc[bankData.loan == 'yes', 'LoadID'] = 1

bankData['FinbehID'] = 0
bankData.loc[bankData.default == 'yes', ['FinbehID']]

bankEntities = ft.EntitySet(id = 'Bank')
bankEntities.entity_from_dataframe(entity_id='Demographic Data', dataframe=bankData, index='custID')
bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='Assets', index='AssetID', additional_variables=['housing'])

bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='Liability', index='LoanID', additional_variables=['loan'])
bankEntities.normalize_entity(base_entity_id='Demographic Data', new_entity_id='FinBehaviour', index='FinbehID', 
                              additional_variables=['default'])

Entityset: Bank
  Entities:
    Demographic Data [Rows: 45211, Columns: 19]
    Assets [Rows: 2, Columns: 2]
    Liability [Rows: 1, Columns: 2]
    FinBehaviour [Rows: 1, Columns: 2]
  Relationships:
    Demographic Data.AssetID -> Assets.AssetID
    Demographic Data.LoanID -> Liability.LoanID
    Demographic Data.FinbehID -> FinBehaviour.FinbehID

In [29]:
feature_set, feature_names = ft.dfs(entityset=bankEntities, 
target_entity = 'Demographic Data', 
max_depth = 2, 
verbose = 1, 
n_jobs = 1)

Built 234 features
Elapsed: 00:02 | Progress: 100%|██████████


In [31]:
feature_set = feature_set.reindex(index=bankData['custID'])
feature_set = feature_set.reset_index()

In [32]:
feature_set.head()

Unnamed: 0,custID,age,job,marital,education,balance,contact,day,month,duration,...,FinBehaviour.NUM_UNIQUE(Demographic Data.month),FinBehaviour.NUM_UNIQUE(Demographic Data.AssetID),FinBehaviour.MODE(Demographic Data.LoanID),FinBehaviour.MODE(Demographic Data.marital),FinBehaviour.MODE(Demographic Data.job),FinBehaviour.MODE(Demographic Data.poutcome),FinBehaviour.MODE(Demographic Data.contact),FinBehaviour.MODE(Demographic Data.education),FinBehaviour.MODE(Demographic Data.month),FinBehaviour.MODE(Demographic Data.AssetID)
0,cust0,58,management,married,tertiary,2143,unknown,5,may,261,...,12,1.0,0,married,blue-collar,unknown,cellular,secondary,may,1.0
1,cust1,44,technician,single,secondary,29,unknown,5,may,151,...,12,1.0,0,married,blue-collar,unknown,cellular,secondary,may,1.0
2,cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,...,12,1.0,0,married,blue-collar,unknown,cellular,secondary,may,1.0
3,cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,...,12,1.0,0,married,blue-collar,unknown,cellular,secondary,may,1.0
4,cust4,33,unknown,single,unknown,1,unknown,5,may,198,...,12,1.0,0,married,blue-collar,unknown,cellular,secondary,may,1.0


In [33]:
feature_names

[<Feature: age>,
 <Feature: job>,
 <Feature: marital>,
 <Feature: education>,
 <Feature: balance>,
 <Feature: contact>,
 <Feature: day>,
 <Feature: month>,
 <Feature: duration>,
 <Feature: campaign>,
 <Feature: pdays>,
 <Feature: previous>,
 <Feature: poutcome>,
 <Feature: AssertID>,
 <Feature: AssetID>,
 <Feature: LoanID>,
 <Feature: LoadID>,
 <Feature: FinbehID>,
 <Feature: Assets.housing>,
 <Feature: Liability.loan>,
 <Feature: FinBehaviour.default>,
 <Feature: Assets.SUM(Demographic Data.AssertID)>,
 <Feature: Assets.SUM(Demographic Data.previous)>,
 <Feature: Assets.SUM(Demographic Data.age)>,
 <Feature: Assets.SUM(Demographic Data.balance)>,
 <Feature: Assets.SUM(Demographic Data.pdays)>,
 <Feature: Assets.SUM(Demographic Data.LoadID)>,
 <Feature: Assets.SUM(Demographic Data.duration)>,
 <Feature: Assets.SUM(Demographic Data.day)>,
 <Feature: Assets.SUM(Demographic Data.campaign)>,
 <Feature: Assets.STD(Demographic Data.AssertID)>,
 <Feature: Assets.STD(Demographic Data.previous)

In [43]:
# Definindo transformações primitivas
aggPrimitives=[
        'std', 'min', 'max', 'mean', 
        'last', 'count'
]
tranPrimitives=[
        'percentile']

In [44]:
feature_set, feature_names = ft.dfs(entityset=bankEntities, 
target_entity = 'Demographic Data',
agg_primitives=aggPrimitives,
trans_primitives=tranPrimitives, 
max_depth = 2, 
verbose = 1, 
n_jobs = 1)

Built 195 features
Elapsed: 00:02 | Progress: 100%|██████████


In [45]:
feature_set.head()

Unnamed: 0_level_0,age,job,marital,education,balance,contact,day,month,duration,campaign,...,FinBehaviour.LAST(Demographic Data.custID),FinBehaviour.LAST(Demographic Data.education),FinBehaviour.LAST(Demographic Data.month),FinBehaviour.LAST(Demographic Data.pdays),FinBehaviour.LAST(Demographic Data.AssetID),FinBehaviour.LAST(Demographic Data.LoadID),FinBehaviour.LAST(Demographic Data.duration),FinBehaviour.LAST(Demographic Data.day),FinBehaviour.LAST(Demographic Data.campaign),FinBehaviour.COUNT(Demographic Data)
custID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
cust0,58,management,married,tertiary,2143,unknown,5,may,261,1,...,cust45210,secondary,nov,188,,,361,17,2,45211
cust1,44,technician,single,secondary,29,unknown,5,may,151,1,...,cust45210,secondary,nov,188,,,361,17,2,45211
cust2,33,entrepreneur,married,secondary,2,unknown,5,may,76,1,...,cust45210,secondary,nov,188,,,361,17,2,45211
cust3,47,blue-collar,married,unknown,1506,unknown,5,may,92,1,...,cust45210,secondary,nov,188,,,361,17,2,45211
cust4,33,unknown,single,unknown,1,unknown,5,may,198,1,...,cust45210,secondary,nov,188,,,361,17,2,45211
