# Automatic Feature Engineering with Featuretools
by Matthew Emery

In [15]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
import featuretools as ft
from random import sample
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from mlxtend.classifier import StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from hpsklearn import HyperoptEstimator, any_classifier, xgboost_classification, sgd

In [2]:
train_df = pd.read_csv("data/train.csv")
train_target = train_df["Survived"]
train_features = train_df.drop(columns="Survived")
train_features['Pclass'] = train_features['Pclass'].astype('category')
train_features.sample(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
885,886,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.125,,Q
805,806,3,"Johansson, Mr. Karl Johan",male,31.0,0,0,347063,7.775,,S
335,336,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S
483,484,3,"Turkula, Mrs. (Hedwig)",female,63.0,0,0,4134,9.5875,,S
397,398,2,"McKane, Mr. Peter David",male,46.0,0,0,28403,26.0,,S


In [3]:
basic_features = pd.get_dummies(train_features.drop(columns=["Name", "PassengerId", "Ticket", "Cabin"]))

dummy_model = DummyClassifier()
dummy_cv = cross_val_score(dummy_model, basic_features, train_target, cv=10)
dummy_cv.mean(), dummy_cv.std()

(0.5274287822040631, 0.05693270836044469)

In [16]:
basic_model = make_pipeline(XGBClassifier())
basic_cv = cross_val_score(basic_model, basic_features, train_target, cv=10)
basic_cv.mean(), basic_cv.std()

(0.8261133810010215, 0.0318043494707937)

In [17]:
#How about some feature engineering?
ft_features = train_features.drop(columns=["Name", "Ticket", "Cabin"])
es = ft.EntitySet(id="titanic")
es.entity_from_dataframe(entity_id="titanic",
                         dataframe=ft_features,
                         index="PassengerId",
                         variable_types={
                             "Pclass": ft.variable_types.Categorical,
                             "Sex": ft.variable_types.Categorical,
                             "Embarked": ft.variable_types.Categorical
                           })
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="classes",
                index="Pclass")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="sexes",
                index="Sex")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="embarkeds",
                index="Embarked")

ValueError: labels ['Name' 'Ticket' 'Cabin'] not contained in axis

In [7]:
feature_matrix, features = ft.dfs(entityset=es,
                              target_entity="titanic",
                              save_progress="results/",
                              verbose=True)

feature_matrix, features = ft.encode_features(feature_matrix, features)

feature_matrix.sample(5)

Built 94 features
Elapsed: 00:00 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Pclass = 3,Pclass = 1,Pclass = 2,Pclass = unknown,Sex = male,Sex = female,...,embarkeds.MEAN(titanic.Parch),embarkeds.MEAN(titanic.Fare),embarkeds.COUNT(titanic),embarkeds.NUM_UNIQUE(titanic.Pclass),embarkeds.NUM_UNIQUE(titanic.Sex),embarkeds.MODE(titanic.Pclass) = 3.0,embarkeds.MODE(titanic.Pclass) = 1.0,embarkeds.MODE(titanic.Pclass) = unknown,embarkeds.MODE(titanic.Sex) = male,embarkeds.MODE(titanic.Sex) = unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,,0,0,7.8958,1,0,0,0,1,0,...,0.363095,59.954144,168.0,3.0,2.0,0,1,0,1,0
874,47.0,0,0,9.0,1,0,0,0,1,0,...,0.413043,27.079812,644.0,3.0,2.0,1,0,0,1,0
250,54.0,1,0,26.0,0,0,1,0,1,0,...,0.413043,27.079812,644.0,3.0,2.0,1,0,0,1,0
418,18.0,0,2,13.0,0,0,1,0,0,1,...,0.413043,27.079812,644.0,3.0,2.0,1,0,0,1,0
145,18.0,0,0,11.5,0,0,1,0,1,0,...,0.413043,27.079812,644.0,3.0,2.0,1,0,0,1,0


In [8]:
sample(features, 5)

[<Feature: sexes.SUM(titanic.Fare)>,
 <Feature: sexes.STD(titanic.SibSp)>,
 <Feature: classes.NUM_UNIQUE(titanic.Sex)>,
 <Feature: Pclass = 1>,
 <Feature: embarkeds.MODE(titanic.Sex) = male>]

In [9]:
ft_cv = cross_val_score(basic_model, feature_matrix, train_target, cv=10)
ft_cv.mean(), ft_cv.std()

(0.8003084212915672, 0.02442933049615746)

In [10]:
train_features['CabinClass'] = train_features["Cabin"].str.get(0)
train_features['LastName'] = train_features['Name'].str.split(", ").apply(lambda x: x[0])
train_features['Honorific'] = train_features['Name'].str.extract(" ([a-zA-z]+)")
train_features['TicketPrefix'] = train_features['Ticket'].str.extract("(.+) ")
train_features['TicketNumber'] = train_features['Ticket'].str.extract("([0-9]+)$").astype(float)
train_features = train_features.drop(columns=["Name", "Cabin", "Ticket"])

In [11]:
es = ft.EntitySet(id="titanic")
es.entity_from_dataframe(entity_id="titanic",
                         dataframe=train_features,
                         index="PassengerId",
                         variable_types={
                             "Pclass": ft.variable_types.Categorical,
                             "Sex": ft.variable_types.Categorical,
                             "Embarked": ft.variable_types.Categorical,
                             "CabinClass": ft.variable_types.Categorical,
                             "LastName": ft.variable_types.Categorical,
                             "Honorific": ft.variable_types.Categorical,
                             "TicketPrefix": ft.variable_types.Categorical
                           })
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="classes",
                index="Pclass")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="sexes",
                index="Sex")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="embarkeds",
                index="Embarked")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="cabinclasses",
                index="CabinClass")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="honorfics",
                index="Honorific")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="lastnames",
                index="LastName")
es.normalize_entity(base_entity_id="titanic",
                new_entity_id="ticketprefixes",
                index="TicketPrefix")

feature_matrix, features = ft.dfs(entityset=es,
                              target_entity="titanic",
                              save_progress="results/",
                              verbose=True)

feature_matrix, features = ft.encode_features(feature_matrix, features)

feature_matrix.sample(5)

Built 313 features
Elapsed: 00:03 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


Unnamed: 0_level_0,Age,SibSp,Parch,Fare,TicketNumber,Pclass = 3,Pclass = 1,Pclass = 2,Pclass = unknown,Sex = male,...,ticketprefixes.MODE(titanic.LastName) = Corn,ticketprefixes.MODE(titanic.LastName) = Hakkarainen,ticketprefixes.MODE(titanic.LastName) = unknown,ticketprefixes.MODE(titanic.Honorific) = Mr,ticketprefixes.MODE(titanic.Honorific) = Miss,ticketprefixes.MODE(titanic.Honorific) = Master,ticketprefixes.MODE(titanic.Honorific) = Capt,ticketprefixes.MODE(titanic.Honorific) = Rev,ticketprefixes.MODE(titanic.Honorific) = Mrs,ticketprefixes.MODE(titanic.Honorific) = unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
816,,0,0,0.0,112058.0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
687,14.0,4,1,39.6875,3101295.0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
353,15.0,1,1,7.2292,2695.0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
535,30.0,0,0,8.6625,315084.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
868,31.0,0,0,50.4958,17590.0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0


In [12]:
expanded_ft_cv = cross_val_score(basic_model, feature_matrix, train_target, cv=10)
expanded_ft_cv.mean(), expanded_ft_cv.std()

(0.8070128248779934, 0.05004947107973656)

In [13]:
pipeline = make_pipeline(
    Imputer(strategy='median'),
    SelectFromModel(ExtraTreesClassifier(), 'median'),
    XGBClassifier()
)

selected_ft_cv = cross_val_score(pipeline, feature_matrix, train_target, cv=10)
selected_ft_cv.mean(), selected_ft_cv.std()

NameError: name 'XGBClassifier' is not defined

In [74]:
hyperopt_pipeline = make_pipeline(
    Imputer(strategy='median'),
    SelectFromModel(ExtraTreesClassifier(), 'median'),
    HyperoptEstimator(
        classifier=any_classifier(name="sgd"),
    )
)

hyperopt_ft_cv = cross_val_score(hyperopt_pipeline, feature_matrix, train_target, cv=10)
hyperopt_ft_cv.mean(), hyperopt_ft_cv.std()

AttributeError: 'NoneType' object has no attribute 'randint'

In [77]:
prepro_pipeline = make_pipeline(Imputer(strategy='median'), 
                   SelectFromModel(ExtraTreesClassifier(), 'median'))

prepro_feature_matrix = prepro_pipeline.fit_transform(feature_matrix)
sgd_fit = HyperoptEstimator(
    classifier=sgd(name="sgd"),
).fit(prepro_feature_matrix)

TypeError: Singleton array array(None, dtype=object) cannot be considered a valid collection.

In [76]:
prepro_pipeline = (Imputer(strategy='median'), 
                   SelectFromModel(ExtraTreesClassifier(), 'median'))

stacked_pipeline = make_pipeline(
    *prepro_pipeline,
    StackingClassifier(
        classifiers=[
            CalibratedClassifierCV(
                make_pipeline(*prepro_pipeline, XGBClassifier()),
            ),
            CalibratedClassifierCV(
                make_pipeline(*prepro_pipeline, KNeighborsClassifier()),
            ),
            CalibratedClassifierCV(
                make_pipeline(*prepro_pipeline, SGDClassifier()),
            ),
                     ],
        meta_classifier=XGBClassifier(),
        use_probas=True
    )
    
)


stacked_ft_cv = cross_val_score(stacked_pipeline, feature_matrix, train_target, cv=10)
stacked_ft_cv.mean(), stacked_ft_cv.std()

TypeError: make_pipeline() argument after * must be an iterable, not Pipeline

In [None]:
StackingCVClassifier