In [67]:
import pickle
from sklearn.calibration import CalibratedClassifierCV
import featuretools as ft
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer, StandardScaler
from xgboost import XGBClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score

In [2]:
# Restore the pickled estimator object stored under results/models.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
estimator_path = "results/models/0.157_100_kitchen-sink_titanic_dfs_fe_estimator.pickle"
with open(estimator_path, "rb") as estimator_file:
    estim = pickle.load(estimator_file)

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [3]:
# Display what the loaded estimator reports as its best model.
estim.best_model()

{'ex_preprocs': (),
 'learner': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.005168273904849856, loss='deviance',
               max_depth=None, max_features=0.11598886784560192,
               max_leaf_nodes=None, min_impurity_decrease=0.0,
               min_impurity_split=None, min_samples_leaf=1,
               min_samples_split=2, min_weight_fraction_leaf=0.0,
               n_estimators=156, presort='auto', random_state=4,
               subsample=0.6081704543206994, verbose=0, warm_start=False),
 'preprocs': (MinMaxScaler(copy=True, feature_range=(-1.0, 1.0)),)}

In [23]:
# Load the Titanic training data and derive label columns from the free-text
# Name / Cabin / Ticket fields, then drop the raw text columns.
train_df = pd.read_csv("data/train.csv")
train_df["Pclass"] = train_df["Pclass"].astype("category")

# First character of the cabin string; rows without a cabin stay NaN.
train_df["CabinClass"] = train_df["Cabin"].str.get(0)

# Everything before the first ", " in Name.
train_df["LastName"] = train_df["Name"].str.split(", ").str[0]

# First word after the comma (assumes names look like "Surname, Title. Given").
# Anchoring on the comma instead of on the first space keeps multi-word
# surnames ("de Pelsmaeker, Mr. ...") from being mistaken for the title, and
# the character class is [a-zA-Z] (the previous "A-z" range also matched
# punctuation such as "[", "^" and "_").
# NOTE(review): this changes Honorific for multi-word surnames, so any feature
# definitions saved from the old values may encode different categories.
train_df["Honorific"] = train_df["Name"].str.extract(", ([a-zA-Z]+)", expand=False)

# expand=False keeps each extraction a Series. Relying on the default raises a
# FutureWarning on older pandas and yields a DataFrame on newer versions.
# TicketPrefix: everything before the last space (NaN when there is no space).
train_df["TicketPrefix"] = train_df["Ticket"].str.extract("(.+) ", expand=False)
# TicketNumber: trailing run of digits, as float so missing values can be NaN.
train_df["TicketNumber"] = (
    train_df["Ticket"].str.extract("([0-9]+)$", expand=False).astype(float)
)

train_df = train_df.drop(columns=["Name", "Cabin", "Ticket"])

# Split into model inputs and the label column.
train_features = train_df.drop(columns=["Survived"])
train_target = train_df["Survived"]

  """
  
  import sys


In [86]:
# Mean of the target column (the positive-class rate, assuming Survived is coded 0/1).
train_target.mean()

0.3838383838383838

In [24]:
# Wrap the engineered training features in a featuretools EntitySet, declare
# the label columns as categorical, and split each of them out into its own
# entity.
categorical_columns = ["Pclass", "Sex", "Embarked", "CabinClass",
                       "LastName", "Honorific", "TicketPrefix"]

es = ft.EntitySet(id="titanic")
es.entity_from_dataframe(
    entity_id="titanic",
    dataframe=train_features,
    index="PassengerId",
    variable_types={column: ft.variable_types.Categorical
                    for column in categorical_columns},
)

# (new entity id, index column) pairs, in creation order.
# The id "honorfics" (sic) is kept exactly as written: presumably the feature
# definitions stored in results/encoded_features.pkl refer to these entity ids
# - verify before renaming.
normalized_entities = [
    ("classes", "Pclass"),
    ("sexes", "Sex"),
    ("embarkeds", "Embarked"),
    ("cabinclasses", "CabinClass"),
    ("honorfics", "Honorific"),
    ("lastnames", "LastName"),
    ("ticketprefixes", "TicketPrefix"),
]
for new_entity_id, index_column in normalized_entities:
    es.normalize_entity(base_entity_id="titanic",
                        new_entity_id=new_entity_id,
                        index=index_column)

# Load previously saved feature definitions against this entity set and
# compute the training feature matrix from them.
features = ft.load_features("results/encoded_features.pkl", es)
train_matrix = ft.calculate_feature_matrix(features)

In [84]:
# End-to-end model: featuretools feature generation -> median imputation ->
# standardisation -> XGBoost classifier.
# NOTE(review): FeaturetoolsTransformer is defined in a later cell of this
# notebook; that cell has to be executed before this one.
feature_step = FeaturetoolsTransformer(
    categorical_features=["Pclass", "Sex", "Embarked", "CabinClass",
                          "LastName", "Honorific", "TicketPrefix"],
    index="PassengerId",
    filepath="results/sklearn_encoded_features.pkl",
    warm_start=False,
)
pipeline = Pipeline([
    ('featuretools', feature_step),
    ('imputer', Imputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', XGBClassifier()),
])

In [85]:
# 3-fold cross-validation of the whole pipeline on the training data.
# NOTE(review): every fitted FeaturetoolsTransformer writes its feature
# definitions to the same filepath, so presumably the file left on disk
# afterwards comes from whichever fold was fitted last - confirm if it is reused.
cross_val_score(pipeline, train_features, train_target, cv=3)

  if diff:
  if diff:
  if diff:


array([0.81144781, 0.81481481, 0.85521886])

In [65]:
# Wrap the pipeline in an isotonic probability calibrator that uses 5-fold CV.
calibrator = CalibratedClassifierCV(pipeline, cv=5, method="isotonic")

In [32]:
# Fit the calibrator on the training data.
# NOTE(review): the output recorded for this cell shows an
# Imputer -> MinMaxScaler -> GradientBoostingClassifier base estimator, which
# does not match the `pipeline` defined in this notebook. The cell appears to
# have been run against an earlier definition - re-run after a kernel restart.
calibrator.fit(train_features, train_target)

CalibratedClassifierCV(base_estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('preprocessor', MinMaxScaler(copy=True, feature_range=(-1.0, 1.0))), ('model', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005168273904849856, loss='...t='auto', random_state=4,
              subsample=0.6081704543206994, verbose=0, warm_start=False))]),
            cv=5, method='isotonic')

In [83]:
class FeaturetoolsTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that turns a DataFrame into a featuretools feature matrix.

    The first ``transform`` after ``fit`` (with ``warm_start=False``) runs
    ``ft.dfs`` and ``ft.encode_features`` on the data it is given and saves the
    resulting feature definitions to ``filepath``. Every later ``transform``
    loads those definitions from ``filepath`` and evaluates them on the new
    data, so later data gets the features chosen on the first call.

    Parameters
    ----------
    categorical_features : list of str
        Columns declared as ``ft.variable_types.Categorical``; each one is also
        split out into its own entity via ``normalize_entity``.
    index : str
        Name of the column used as the entity index.
    filepath : str
        Where feature definitions are saved to / loaded from.
    warm_start : bool
        If True, never run ``ft.dfs``; always load the definitions already
        stored at ``filepath``.

    Attributes
    ----------
    es : the entity set built from the most recently transformed data.
    features : the feature definitions used by the most recent ``transform``.
    """

    def __init__(self, categorical_features, index, filepath, warm_start):
        self.categorical_features = categorical_features
        self.index = index
        self.filepath = filepath
        self.warm_start = warm_start

    def fit(self, X, y=None):
        """Mark the feature definitions as needing to be (re)learned.

        Nothing is computed here; the definitions are produced by the next
        ``transform`` call. Resetting the flag means that re-fitting the same
        instance on new data relearns the definitions instead of silently
        reusing the ones saved by an earlier fit. With ``warm_start=True`` the
        stored definitions are always reused.

        Returns
        -------
        self
        """
        self._needs_dfs = not self.warm_start
        return self

    def _build_entityset(self, X):
        """Build the entity set for ``X`` with one extra entity per categorical column."""
        variable_types = {variable: ft.variable_types.Categorical
                          for variable in self.categorical_features}
        es = ft.EntitySet(id="id")
        es.entity_from_dataframe(entity_id="id",
                                 dataframe=X,
                                 index=self.index,
                                 variable_types=variable_types)
        for variable in self.categorical_features:
            es.normalize_entity(base_entity_id="id",
                                new_entity_id=variable,
                                index=variable)
        return es

    def transform(self, X, y=None):
        """Return the feature matrix for ``X``.

        Learns and saves the feature definitions when none have been learned
        since the last ``fit``; otherwise loads them from ``filepath``.
        """
        self.es = self._build_entityset(X)

        # The learned state lives in a private attribute rather than in
        # ``warm_start``: constructor parameters must stay as the caller set
        # them so that get_params()/clone() reproduce the original
        # configuration. The getattr default covers transform() being called
        # without a prior fit().
        needs_dfs = getattr(self, "_needs_dfs", not self.warm_start)

        if not needs_dfs:
            self.features = ft.load_features(self.filepath, self.es)
            return ft.calculate_feature_matrix(self.features)

        matrix, self.features = ft.dfs(entityset=self.es,
                                       target_entity="id",
                                       save_progress="results/",
                                       verbose=False)
        matrix, self.features = ft.encode_features(matrix, self.features)
        ft.save_features(self.features, self.filepath)
        # Only switch to loading once the definitions are safely on disk.
        self._needs_dfs = False
        return matrix

In [58]:
# Stand-alone transformer instance, used to inspect the generated feature
# matrix outside of the modelling pipeline.
transformer_columns = ["Pclass", "Sex", "Embarked", "CabinClass",
                       "LastName", "Honorific", "TicketPrefix"]
featuretools = FeaturetoolsTransformer(
    categorical_features=transformer_columns,
    index="PassengerId",
    filepath="results/sklearn_encoded_features.pkl",
    warm_start=False,
)

In [59]:
# Run the stand-alone transformer on the training features and display the
# resulting feature matrix.
featuretools.fit_transform(train_features)

Built 313 features
Elapsed: 00:07 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


Unnamed: 0_level_0,Age,SibSp,Parch,Fare,TicketNumber,Pclass = 3,Pclass = 1,Pclass = 2,Pclass = unknown,Sex = male,...,TicketPrefix.MODE(id.LastName) = Corn,TicketPrefix.MODE(id.LastName) = Hakkarainen,TicketPrefix.MODE(id.LastName) = unknown,TicketPrefix.MODE(id.Honorific) = Mr,TicketPrefix.MODE(id.Honorific) = Miss,TicketPrefix.MODE(id.Honorific) = Master,TicketPrefix.MODE(id.Honorific) = Capt,TicketPrefix.MODE(id.Honorific) = Rev,TicketPrefix.MODE(id.Honorific) = Mrs,TicketPrefix.MODE(id.Honorific) = unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,22.0,1,0,7.2500,21171.0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
2,38.0,1,0,71.2833,17599.0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,26.0,0,0,7.9250,3101282.0,1,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,35.0,1,0,53.1000,113803.0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
5,35.0,0,0,8.0500,373450.0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
6,,0,0,8.4583,330877.0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
7,54.0,0,0,51.8625,17463.0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
8,2.0,3,1,21.0750,349909.0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
9,27.0,0,2,11.1333,347742.0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
10,14.0,1,0,30.0708,237736.0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
