# Automatic Feature Engineering with Featuretools
by Matthew Emery

In [1]:
import pandas as pd
from xgboost import XGBClassifier, plot_importance
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import featuretools as ft
from random import sample
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import make_scorer, confusion_matrix, matthews_corrcoef
import warnings
from itertools import chain
import seaborn as sns
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from IPython.core.debugger import set_trace
from tpot import TPOTClassifier, config
%matplotlib inline

warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("data/adult.csv")
target = df["income"]
features = df.drop(columns="income")
features = pd.get_dummies(features)

train_features, test_features, train_target, test_target = train_test_split(features, 
                                                                            target,
                                                                            train_size=0.75,
                                                                            stratify=target,
                                                                            random_state=0)

train_features.sample(5)

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
13576,42,31621,12,0,0,40,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
22008,82,147729,3,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
17320,35,49749,10,0,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
26822,43,167265,13,0,0,84,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
8976,51,254211,13,0,0,20,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
target.value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [4]:
def matthews_cross_val(model, features):
    return cross_val_score(model, 
                           features,
                           train_target, 
                           cv=5,
                           scoring=make_scorer(matthews_corrcoef))

In [5]:
dummy_model = DummyClassifier(random_state=0)
dummy_cv = cross_val_score(dummy_model, train_features, train_target, cv=5, scoring=make_scorer(matthews_corrcoef))
dummy_cv.mean(), dummy_cv.std()

(-0.012706797451711289, 0.008676973964245771)

In [6]:
basic_model = XGBClassifier(random_state=0)
basic_cv = matthews_cross_val(basic_model, 
                              pd.get_dummies(train_features))
basic_cv.mean(), basic_cv.std()

(0.6012086258922207, 0.011954271745508522)

In [10]:
tpot = TPOTClassifier(periodic_checkpoint_folder="results/checkpoints-census", 
                      cv=5, 
                      verbosity=2, 
                      n_jobs=-1,
                      generations=10,
                      population_size=10,
                      scoring=make_scorer(matthews_corrcoef))
tpot.fit(train_features, target=train_target)

Optimization Progress:   1%|          | 1/110 [03:30<6:22:43, 210.67s/pipeline]Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f2b11449550>>
Traceback (most recent call last):
  File "/home/deadhead/miniconda3/envs/automl-talk/lib/python3.6/site-packages/xgboost/core.py", line 368, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'
Optimization Progress:  19%|█▉        | 21/110 [08:18<2:42:54, 109.83s/pipeline]

Generation 1 - Current best internal CV score: 0.618225742630993


Optimization Progress:  28%|██▊       | 31/110 [12:20<1:43:00, 78.23s/pipeline] 

Generation 2 - Current best internal CV score: 0.618225742630993


Optimization Progress:  38%|███▊      | 42/110 [20:26<1:47:07, 94.52s/pipeline] 

Generation 3 - Current best internal CV score: 0.6191224116987836


Optimization Progress:  47%|████▋     | 52/110 [31:27<2:09:58, 134.46s/pipeline]

Generation 4 - Current best internal CV score: 0.6211458512572665


Optimization Progress:  57%|█████▋    | 63/110 [39:13<1:23:59, 107.23s/pipeline]

Generation 5 - Current best internal CV score: 0.628320874918898


Optimization Progress:  67%|██████▋   | 74/110 [47:30<1:26:20, 143.90s/pipeline]

Generation 6 - Current best internal CV score: 0.628320874918898


Optimization Progress:  76%|███████▋  | 84/110 [52:31<42:19, 97.67s/pipeline]   

Generation 7 - Current best internal CV score: 0.628320874918898


Optimization Progress:  85%|████████▌ | 94/110 [58:29<21:50, 81.88s/pipeline] 

Generation 8 - Current best internal CV score: 0.628320874918898


Optimization Progress:  95%|█████████▍| 104/110 [1:00:17<04:10, 41.67s/pipeline]

Generation 9 - Current best internal CV score: 0.628320874918898


                                                                                

Generation 10 - Current best internal CV score: 0.628320874918898

Best pipeline: GradientBoostingClassifier(input_matrix, learning_rate=0.1, max_depth=9, max_features=0.25, min_samples_leaf=7, min_samples_split=13, n_estimators=100, subsample=0.9000000000000001)


TPOTClassifier(config_dict={'sklearn.naive_bayes.GaussianNB': {}, 'sklearn.naive_bayes.BernoulliNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.naive_bayes.MultinomialNB': {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], 'fit_prior': [True, False]}, 'sklearn.tree.DecisionT....3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ])}}}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=10, max_eval_time_mins=5,
        max_time_mins=None, memory=None, mutation_rate=0.9, n_jobs=4,
        offspring_size=10,
        periodic_checkpoint_folder='results/checkpoints-census',
        population_size=10, random_state=None, scoring=None, subsample=1.0,
        verbosity=2, warm_start=False)