In [3]:
import pickle
from mlxtend.classifier import StackingClassifier
import pandas as pd
import featuretools as ft
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from hpsklearn import HyperoptEstimator, any_classifier
import numpy as np
from collections import defaultdict
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin

%matplotlib inline

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


In [1]:
pickle_files = ['0.162_100_ada_boost_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle',
                '0.158_100_extra_trees_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle',
                '0.155_100_xgboost_classification_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle',
                '0.148_100_random_forest_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle',
                '0.172_100_knn_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle',
                '0.18_100_sgd_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle'
               ]

In [6]:
class FeaturetoolsTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, categorical_features, index, filepath, warm_start):
        self.categorical_features = categorical_features
        self.index = index
        self.filepath = filepath
        self.warm_start = warm_start
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        variable_types = {variable: ft.variable_types.Categorical for variable in self.categorical_features}
        es = ft.EntitySet(id="id")
        es.entity_from_dataframe(entity_id="id",
                                 dataframe=X,
                                 index=self.index,
                                 variable_types=variable_types)
        for variable in self.categorical_features:
            es.normalize_entity(base_entity_id="id",
                            new_entity_id=variable,
                            index=variable)
        self.es = es
        if self.warm_start:
            self.features = ft.load_features(self.filepath, self.es)
            matrix = ft.calculate_feature_matrix(self.features)
            return matrix
        else:
            matrix, self.features = ft.dfs(entityset=self.es,
                                      target_entity="id",
                                      save_progress="results/",
                                      verbose=False)
            
            matrix, self.features = ft.encode_features(matrix, self.features)
            self.warm_start = True
            ft.save_features(self.features, self.filepath)
            return matrix

In [7]:
with open(f"results/models/0.148_100_random_forest_titanic_dfs_ensemble_pipeline_cv10_estimator.pickle", "rb") as model_file:
    hyperopt_model = pickle.load(model_file)
    model = hyperopt_model.best_model()['learner']
    prepro = hyperopt_model.best_model()['preprocs']

pipeline = make_pipeline(FeaturetoolsTransformer(
        categorical_features=["Pclass", "Sex", "Embarked", "CabinClass", 
                          "LastName", "Honorific", "TicketPrefix"],
    index = "PassengerId",
    filepath="results/sklearn_encoded_features.pkl",
    warm_start=True), Imputer(strategy='median'),
                         StandardScaler(),
                         *prepro, model)

In [8]:
train_df = pd.read_csv("data/train.csv")
train_df["Pclass"] = train_df["Pclass"].astype("category")
train_df['CabinClass'] = train_df["Cabin"].str.get(0)
train_df['LastName'] = train_df['Name'].str.split(", ").apply(lambda x: x[0])
train_df['Honorific'] = train_df['Name'].str.extract(" ([a-zA-z]+)")
train_df['TicketPrefix'] = train_df['Ticket'].str.extract("(.+) ")
train_df['TicketNumber'] = train_df['Ticket'].str.extract("([0-9]+)$").astype(float)
train_df = train_df.drop(columns=["Name", "Cabin", "Ticket"])
train_features = train_df.drop(columns=["Survived"])
train_target = train_df["Survived"]
pipeline.fit(train_features, train_target)

  """
  
  import sys


Pipeline(memory=None,
     steps=[('featuretoolstransformer', FeaturetoolsTransformer(categorical_features=['Pclass', 'Sex', 'Embarked', 'CabinClass', 'LastName', 'Honorific', 'TicketPrefix'],
            filepath='results/sklearn_encoded_features.pkl',
            index='PassengerId', warm_start=True)), ('imputer', Imputer(a...mators=22, n_jobs=1, oob_score=False, random_state=3,
            verbose=False, warm_start=False))])

In [16]:
prepro_pipeline = make_pipeline(FeaturetoolsTransformer(
        categorical_features=["Pclass", "Sex", "Embarked", "CabinClass", 
                          "LastName", "Honorific", "TicketPrefix"],
        index = "PassengerId",
        filepath="results/sklearn_encoded_features2.pkl",
        warm_start=False), Imputer(strategy='median'),
                             StandardScaler())
prepro_features = prepro_pipeline.fit_transform(train_features)

In [21]:
from tqdm import tnrange, tqdm

In [22]:
calibrated_estimators = []
for pickle_file in tqdm(pickle_files, total=len(pickle_files)):
    with open(f"results/models/{pickle_file}", "rb") as model_file:
        hyperopt_model = pickle.load(model_file)
        model = hyperopt_model.best_model()['learner']
        prepro = hyperopt_model.best_model()['preprocs']
        pipeline =  make_pipeline(*prepro, model)
        calibrator = CalibratedClassifierCV(model, cv=5, method="isotonic")
        calibrator.fit(prepro_features, train_target)
        calibrated_estimators.append(calibrator)

100%|██████████| 6/6 [10:54<00:00, 109.13s/it]


In [24]:
stack = StackingClassifier(classifiers=calibrated_estimators, 
                   meta_classifier=XGBClassifier(),
                   use_probas=True,
                   average_probas=False)

In [25]:
cross_val_score(stack, prepro_features, train_target)

  if diff:
  if diff:
  if diff:


array([0.8047138 , 0.85521886, 0.8047138 ])

In [26]:
stack.fit(prepro_features, train_target)



StackingClassifier(average_probas=False,
          classifiers=[CalibratedClassifierCV(base_estimator=AdaBoostClassifier(algorithm='SAMME', base_estimator=None,
          learning_rate=0.2969585196792592, n_estimators=866,
          random_state=1),
            cv=5, method='isotonic'), CalibratedClassifierCV(base_estimator=ExtraTreesClassifier(boot...     shuffle=True, tol=None, verbose=False, warm_start=False),
            cv=5, method='isotonic')],
          meta_classifier=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          refit=True, store_train_meta_features=False,
          use_features_in_secondary=False, use_probas=True, verbose=0)

In [27]:
test_df = pd.read_csv("data/test.csv")
test_df["Pclass"] = test_df["Pclass"].astype("category")
test_df['CabinClass'] = test_df["Cabin"].str.get(0)
test_df['LastName'] = test_df['Name'].str.split(", ").apply(lambda x: x[0])
test_df['Honorific'] = test_df['Name'].str.extract(" ([a-zA-z]+)")
test_df['TicketPrefix'] = test_df['Ticket'].str.extract("(.+) ")
test_df['TicketNumber'] = test_df['Ticket'].str.extract("([0-9]+)$").astype(float)
test_df = test_df.drop(columns=["Name", "Cabin", "Ticket"])
test_features = prepro_pipeline.transform(test_df)

  """
  
  import sys


In [28]:
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'],
              'Survived': stack.predict(test_features)})
submission_df.to_csv("results/ft_hpsklearn_submission_stacked_pipeline.csv", index=False)

  if diff:


In [67]:
with open(f"results/models/0.0_100_sgd_titanic_dfs_ensemble_pipeline_estimator.pickle", "rb") as model_file:
    hyperopt_model = pickle.load(model_file)
    model = hyperopt_model.best_model()['learner']
    prepro = hyperopt_model.best_model()['preprocs']
    pipeline = make_pipeline(*prepro, model)
    featuretools = FeaturetoolsTransformer(
        categorical_features=["Pclass", "Sex", "Embarked", "CabinClass", 
                          "LastName", "Honorific", "TicketPrefix"],
    index = "PassengerId",
    filepath="results/sklearn_encoded_features_sgd.pkl",
    warm_start=False)
    pipeline = make_pipeline(featuretools, Imputer(strategy='median'), StandardScaler(), *prepro, model)

In [62]:
class FeaturetoolsTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self, categorical_features, index, filepath, warm_start):
        self.categorical_features = categorical_features
        self.index = index
        self.filepath = filepath
        self._warm_start = warm_start
        
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        variable_types = {variable: ft.variable_types.Categorical for variable in self.categorical_features}
        es = ft.EntitySet(id="id")
        es.entity_from_dataframe(entity_id="id",
                                 dataframe=X,
                                 index=self.index,
                                 variable_types=variable_types)
        for variable in self.categorical_features:
            es.normalize_entity(base_entity_id="id",
                            new_entity_id=variable,
                            index=variable)
        self.es = es
        if self._warm_start:
            self.features = ft.load_features(self.filepath, self.es)
            matrix = ft.calculate_feature_matrix(self.features)
            return matrix
        else:
            matrix, self.features = ft.dfs(entityset=self.es,
                                      target_entity="id",
                                      save_progress="results/",
                                      verbose=False)
            
            matrix, self.features = ft.encode_features(matrix, self.features)
            self._warm_start = True
            ft.save_features(self.features, self.filepath)
            return matrix

In [68]:
pipeline.fit(train_features, train_target)



Pipeline(memory=None,
     steps=[('featuretoolstransformer', FeaturetoolsTransformer(categorical_features=['Pclass', 'Sex', 'Embarked', 'CabinClass', 'LastName', 'Honorific', 'TicketPrefix'],
            filepath='results/sklearn_encoded_features_sgd.pkl',
            index='PassengerId', warm_start=None)), ('imputer', Imput....6816036005988287, random_state=2, shuffle=True, tol=None,
       verbose=False, warm_start=False))])

In [58]:
pipeline.named_steps['featuretoolstransformer'].warm_start = True

In [69]:
submission_df = pd.DataFrame({'PassengerId': test_df['PassengerId'],
              'Survived': pipeline.predict(test_df)})
submission_df.to_csv("results/overfitting_sgd.csv", index=False)

In [73]:
pipeline.named_steps['sgdclassifier'].coef_

array([[-0.15508855, -0.01567737,  0.        ,  0.14310383,  0.        ,
        -0.00723776,  0.        ,  0.        ,  0.        , -0.03647884,
         0.03647884,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        , -0.0181046 ,  0.        ,  0.        ,
         0.        , -0.01299327, -0.13416503, -0.13416503,  0.01383824,
         0.        ,  0.01383824, -0.18036973,  0.        , -0.0181046 ,
         0.14817187,  0.        ,  0.        ,  0.0121081 ,  0.04367097,
        -0.01299327,  0.        ,  0.        , -0.0733633 , -0.0733633 ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.07720188,
         0.        , -0.04358718,  0.        ,  0.        ,  0.        ,
        -0.01299327,  0.        ,  0.        ,  0.        , -0.00400306,
        -0.00468909,  0.        , -0.00922274,  0.01196244, -0.00668621,
        -0.00655651,  0.        , -0.01010861,  0. 