In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft

In [34]:
# let's load the example data set from feature tools

data_dict = ft.demo.load_mock_customer()

data = data_dict["transactions"].merge(
    data_dict["sessions"]).merge(data_dict["customers"])

cols = ['customer_id',
        'transaction_id',
        'transaction_time',
        'amount',
        ]

data = data[cols]

data.head()

Unnamed: 0,customer_id,transaction_id,transaction_time,amount
0,2,298,2014-01-01 00:00:00,127.64
1,2,2,2014-01-01 00:01:05,109.48
2,2,308,2014-01-01 00:02:10,95.06
3,2,116,2014-01-01 00:03:15,78.92
4,2,371,2014-01-01 00:04:20,31.54


In [35]:
print('Number of customers: {}'.format(data['customer_id'].nunique()))
print('Number of transactions: {}'.format(data['transaction_id'].nunique()))

Number of customers: 5
Number of transactions: 500


In [36]:
data.dtypes

customer_id                  int64
transaction_id               int64
transaction_time    datetime64[ns]
amount                     float64
dtype: object

In [37]:
# in order for feature tools to work, we need to create
# entity sets

es = ft.EntitySet(id="customer_data")

# entity set with the transactions
# es.entity_from_dataframe(entity_id='transactions',
#                          dataframe=data,
#                          index="transaction_id",
#                          time_index='transaction_time')

es.add_dataframe(
    dataframe_name='transactions',
    dataframe=data,
    index='transaction_id',
    time_index='transaction_time'
)
# display the entity set
es

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 4]
  Relationships:
    No relationships

In [38]:
# now we indicate that within the entity set
# there is another table, with customer data,
# each customer identified with a unique
# customer id

# es.normalize_entity(base_entity_id="transactions",
#                     new_entity_id="customers",
#                     index="customer_id")

es.normalize_dataframe(
    base_dataframe_name="transactions",
    new_dataframe_name="customers",
    index="customer_id"
)

Entityset: customer_data
  DataFrames:
    transactions [Rows: 500, Columns: 4]
    customers [Rows: 5, Columns: 2]
  Relationships:
    transactions.customer_id -> customers.customer_id

 ## Creating new features from the existing data - without aggregations

In [40]:
# featuretools automatically creates more fearures from those present in the data set
# for every single transaction, that is without aggregation

# the names in the below list, indicate featuretools to:
# 1) Create a feature from the datetime variable signalling
# if the transaction occurred on a weekend
# 2) Determines the cumulative transaction amount,
# 3) Determine the cumulative number of transactions,
# 4) Creates a new feature indicating the time since the previous transaction

# all of this operations occur at a transaction level, that is, transaction
# after transaction

transf_operations = ['is_weekend', 'cum_sum', 'cum_count','time_since_previous']

# set up the dfs from featuretools to return the previous features
# it is important to leave agg_primitives as an empty list, so that featuretools does not
# aggregate the data at a customer level as well

# feature_matrix, features = ft.dfs(entityset=es,
#                       target_entity="transactions",
#                       agg_primitives=[],
#                       trans_primitives=transf_operations,
#                       verbose=True)

feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="transactions",
    agg_primitives=[],
    trans_primitives=transf_operations,
    verbose=True,
)

feature_matrix.head()

  trans_primitives: ['cum_count']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


Built 7 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,customer_id,amount,CUM_SUM(amount),IS_WEEKEND(transaction_time),TIME_SINCE_PREVIOUS(transaction_time),customers.IS_WEEKEND(first_transactions_time),customers.TIME_SINCE_PREVIOUS(first_transactions_time)
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
298,2,127.64,127.64,False,,False,
2,2,109.48,237.12,False,65.0,False,
308,2,95.06,332.18,False,65.0,False,
116,2,78.92,411.1,False,65.0,False,
371,2,31.54,442.64,False,65.0,False,


In [41]:
# let's now extract information about date and time as well

# list with the operations to perform to extract new features from the 
# existing variables
operations = ['year', 'month', 'day', 'hour', 'minute', 'second',
             'is_weekend', 'cum_sum', 'time_since_previous']


# extract the new features
# feature_matrix, features = ft.dfs(entityset=es,
#                       target_entity="transactions",
#                       agg_primitives=[],
#                       trans_primitives=None,
#                       verbose=True)

feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="transactions",  # ✅ updated parameter
    agg_primitives=[],
    trans_primitives=[],                   # can use [] instead of None
    verbose=True
)

feature_matrix.head()

Built 2 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


Unnamed: 0_level_0,customer_id,amount
transaction_id,Unnamed: 1_level_1,Unnamed: 2_level_1
298,2,127.64
2,2,109.48
308,2,95.06
116,2,78.92
371,2,31.54


In [42]:
# original dataframe and derived feature_matrix should have
# same number of observations

data.shape, feature_matrix.shape

((500, 4), (500, 2))

In [17]:
# if we don't indicate the operations to perform to create new features, featuretools will perform
# the default operations, which can be found here:

# https://docs.featuretools.com/en/stable/generated/featuretools.dfs.html#featuretools.dfs

In [18]:
# if we had more than 1 numeric feature in our dataframe, we could create new feautres by addition
# or multiplication as we did in recipe Combining multiple features with statistical operations of Chapter 9,
# Applying Mathematical Computations to Features

# for code on how to do that check this link:
# https://stackoverflow.com/questions/55155371/how-to-use-featuretools-to-create-features-for-a-single-table-with-no-immediate/55172142#55172142

In [43]:
# for more details in the types of features that we can create
# check the supported operations with the code below

primitives = ft.list_primitives()
pd.options.display.max_colwidth = 500
primitives[primitives['type'] == 'transform']

Unnamed: 0,name,type,description,valid_inputs,return_type
65,whitespace_count,transform,Calculates number of whitespaces in a string.,<ColumnSchema (Logical Type = NaturalLanguage)>,<ColumnSchema (Logical Type = IntegerNullable) (Semantic Tags = ['numeric'])>
66,count_string,transform,Determines how many times a given string shows up in a text field.,<ColumnSchema (Logical Type = NaturalLanguage)>,<ColumnSchema (Logical Type = IntegerNullable) (Semantic Tags = ['numeric'])>
67,cumulative_time_since_last_true,transform,Determines the time (in seconds) since the last boolean was `True` given a datetime index column and boolean column,"<ColumnSchema (Logical Type = Boolean)>, <ColumnSchema (Logical Type = Datetime) (Semantic Tags = ['time_index'])>",<ColumnSchema (Logical Type = Double) (Semantic Tags = ['numeric'])>
68,is_in_geobox,transform,Determines if coordinates are inside a box defined by two corner coordinate points.,<ColumnSchema (Logical Type = LatLong)>,<ColumnSchema (Logical Type = BooleanNullable)>
69,greater_than_equal_to,transform,Determines if values in one list are greater than or equal to another list.,"<ColumnSchema (Logical Type = Ordinal: None)>, <ColumnSchema (Logical Type = Datetime)>, <ColumnSchema (Semantic Tags = ['numeric'])>",<ColumnSchema (Logical Type = BooleanNullable)>
...,...,...,...,...,...
205,divide_by_feature,transform,Divides a scalar by each value in the list.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>
206,diff_datetime,transform,Computes the timedelta between a datetime in a list and the previous datetime in that list.,<ColumnSchema (Logical Type = Datetime)>,<ColumnSchema (Logical Type = Timedelta)>
207,cosine,transform,Computes the cosine of a number.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Logical Type = Double) (Semantic Tags = ['numeric'])>
208,absolute,transform,Computes the absolute value of a number.,<ColumnSchema (Semantic Tags = ['numeric'])>,<ColumnSchema (Semantic Tags = ['numeric'])>


 ## Creating new features from the existing data - with aggregations

These are the features we can create when we want a flattened view of our dataframe, that is one row per customer.

In [44]:
# differently from the previous case, here we want to aggregate
# the information at customer level, so we want a summary view of
# each customer

# first we aggregate only the existing variables
# that is the transaction amount

# we want the mean and maximum transaction amount per customer

# feature_matrix, features = ft.dfs(entityset=es,
#                       target_entity="customers",
#                       agg_primitives=["mean", 'max'],
#                       trans_primitives=[],
#                       verbose=True)

feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",  # ✅ updated parameter
    agg_primitives=["mean", "max"],
    trans_primitives=[],
    verbose=True
)

# dataframe with the new features
feature_matrix

Built 2 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


  ).agg(to_agg)
  ).agg(to_agg)


Unnamed: 0_level_0,MAX(transactions.amount),MEAN(transactions.amount)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,146.81,77.422366
5,149.02,80.375443
4,149.95,80.070459
1,139.43,71.631905
3,149.15,67.06043


In [45]:
# now we will create a new feature as in the first part of the 
# notebook, and more specifically, we will create the feature
# time_since_previous transaction

# and aggregate the new feature together with the pre-existing
# variable transaction amount, all in one go

# feature_matrix, features = ft.dfs(entityset=es,
#                       target_entity="customers",
#                       agg_primitives=["mean", 'max'],
#                       trans_primitives=['time_since_previous'],
#                       verbose=True)

feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",  # ✅ updated parameter
    agg_primitives=["mean", "max"],
    trans_primitives=["time_since_previous"],
    verbose=True
)

# dataframe with the new features
feature_matrix

Built 5 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


  ).agg(to_agg)
  ).agg(to_agg)


Unnamed: 0_level_0,MAX(transactions.amount),MEAN(transactions.amount),TIME_SINCE_PREVIOUS(first_transactions_time),MAX(transactions.TIME_SINCE_PREVIOUS(transaction_time)),MEAN(transactions.TIME_SINCE_PREVIOUS(transaction_time))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,146.81,77.422366,,65.0,65.0
5,149.02,80.375443,1040.0,65.0,65.0
4,149.95,80.070459,650.0,65.0,65.0
1,139.43,71.631905,975.0,65.0,65.0
3,149.15,67.06043,3315.0,65.0,65.0


In [46]:
# note how the final dataframe contains as many rows as 
# different customers in the data set

data.shape, feature_matrix.shape

((500, 4), (5, 5))

In [47]:
# and now let's create 2 new features and aggregate both
# together with the transaction amount

# feature_matrix, features = ft.dfs(entityset=es,
#                       target_entity="customers",
#                       agg_primitives=["mean", 'max'],
#                       trans_primitives=['cum_sum','time_since_previous'],
#                       verbose=True)

feature_matrix, features = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",   # updated parameter
    agg_primitives=["mean", "max"],
    trans_primitives=["cum_sum", "time_since_previous"],  # make sure names are valid
    verbose=True
)

# dataframe with the new features
feature_matrix

Built 9 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)
  ).agg(to_agg)


Unnamed: 0_level_0,MAX(transactions.amount),MEAN(transactions.amount),TIME_SINCE_PREVIOUS(first_transactions_time),MAX(transactions.CUM_SUM(amount)),MAX(transactions.TIME_SINCE_PREVIOUS(transaction_time)),MEAN(transactions.CUM_SUM(amount)),MEAN(transactions.TIME_SINCE_PREVIOUS(transaction_time)),CUM_SUM(MAX(transactions.amount)),CUM_SUM(MEAN(transactions.amount))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,146.81,77.422366,,35101.22,65.0,18794.182366,65.0,146.81,77.422366
5,149.02,80.375443,1040.0,34296.39,65.0,24258.206582,65.0,295.83,157.797809
4,149.95,80.070459,650.0,23418.09,65.0,12455.024495,65.0,445.78,237.868267
1,139.43,71.631905,975.0,31291.89,65.0,16566.784048,65.0,585.21,309.500172
3,149.15,67.06043,3315.0,37539.86,65.0,25387.640538,65.0,734.36,376.560602


In [48]:
data.shape, feature_matrix.shape

((500, 4), (5, 9))

In [55]:
import featuretools
print(featuretools.__version__)

1.31.0


 ## Integrating user defined functions

In the remaining lines, instead of using Featuretools pre-coded functions, we will create our own.

In [66]:
# from featuretools import make_trans_primitive, make_agg_primitive
# from featuretools.variable_types import Numeric

# from scipy.signal import find_peaks


# def find_no_peaks(column):
#     peaks, _ = find_peaks(column)
#     return len(peaks)


# def find_no_valleys(column):
#     valleys, _ = find_peaks(1 / column)
#     return len(valleys)


# FindNoPeaks = make_agg_primitive(function=find_no_peaks,
#                                  input_types=[Numeric],
#                                  return_type=Numeric)

# FindNoValleys = make_agg_primitive(function=find_no_valleys,
#                                    input_types=[Numeric],
#                                    return_type=Numeric)


import featuretools as ft
from featuretools.primitives import AggregationPrimitive
from scipy.signal import find_peaks
#from featuretools.variable_types import Number
import numpy as np
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import Double

# Custom aggregation primitive to count peaks
class FindNoPeaks(AggregationPrimitive):
    """Counts peaks in a numeric column"""
    name = "find_no_peaks"
    input_types = [ColumnSchema(logical_type=Double)]
    return_type = ColumnSchema(logical_type=Double)
    uses_calc_time = False

    def get_function(self):
        def count_peaks(x):
            x = np.array(x)
            peaks, _ = find_peaks(x)
            return len(peaks)
        return count_peaks

# Custom aggregation primitive to count valleys
class FindNoValleys(AggregationPrimitive):
    """Counts valleys in a numeric column"""
    name = "find_no_valleys"
    input_types = [ColumnSchema(logical_type=Double)]
    return_type = ColumnSchema(logical_type=Double) 
    uses_calc_time = False

    def get_function(self):
        def count_valleys(x):
            x = np.array(x)
            valleys, _ = find_peaks(1 / x)
            return len(valleys)
        return count_valleys


In [67]:
# and now let's use our 2 new aggregation functions with mean and max

feature_matrix, features = ft.dfs( entityset=es,
    target_dataframe_name="customers",  # ✅ replaces old target_entity
    agg_primitives=[FindNoPeaks, FindNoValleys, "mean", "max"],  # ✅ primitives can be custom or built-in
    trans_primitives=[],
    verbose=True)

# dataframe with the new features
feature_matrix

  agg_primitives: ['find_no_peaks', 'find_no_valleys']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data. If the DFS call contained multiple instances of a primitive in the list above, none of them were used.


Built 2 features
Elapsed: 00:00 | Progress: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████


  ).agg(to_agg)
  ).agg(to_agg)


Unnamed: 0_level_0,MAX(transactions.amount),MEAN(transactions.amount)
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,146.81,77.422366
5,149.02,80.375443
4,149.95,80.070459
1,139.43,71.631905
3,149.15,67.06043
