In [32]:
import pickle
from mlxtend.classifier import StackingClassifier
import pandas as pd
import featuretools as ft
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline, make_pipeline
from hpsklearn import HyperoptEstimator, any_classifier
import numpy as np
from collections import defaultdict
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

%matplotlib inline

In [3]:
# (model name, numeric prefix of its pickle file name), in load order.
# NOTE(review): the prefix is presumably the validation loss recorded when the
# estimator was saved -- confirm against the training notebook.
_model_file_prefixes = [
    ('extra_trees', '0.157'),
    ('knn', '0.157'),
    ('random_forest', '0.163'),
    ('sgd', '0.163'),
    ('xgboost_classification', '0.163'),
    ('ada_boost', '0.157'),
]

# (model name, pickle file name) pairs; file names are relative to results/models/.
pickle_files = [
    (model_name, f"{prefix}_100_{model_name}_titanic_dfs_ensemble_estimator.pickle")
    for model_name, prefix in _model_file_prefixes
]

In [9]:
# Build the training feature matrix: derive extra columns from the raw CSV,
# run Deep Feature Synthesis over a one-table entity set, one-hot encode the
# result and median-impute missing values.
train_df = pd.read_csv("data/train.csv")
train_df["Pclass"] = train_df["Pclass"].astype("category")
# First character of the cabin string (NaN stays NaN).
train_df['CabinClass'] = train_df["Cabin"].str.get(0)
# Everything before the first ", " in the name.
train_df['LastName'] = train_df['Name'].str.split(", ").str.get(0)
# First run of letters that follows a space. The class is [a-zA-Z]; the former
# [a-zA-z] range also matched "[", "\", "]", "^", "_" and "`".
# expand=False makes str.extract return a Series on every pandas version.
train_df['Honorific'] = train_df['Name'].str.extract(" ([a-zA-Z]+)", expand=False)
# Greedy match of everything before the last space; NaN when the ticket has no space.
train_df['TicketPrefix'] = train_df['Ticket'].str.extract("(.+) ", expand=False)
# Trailing digits of the ticket; float so tickets without digits can be NaN.
train_df['TicketNumber'] = train_df['Ticket'].str.extract("([0-9]+)$", expand=False).astype(float)
train_df = train_df.drop(columns=["Name", "Cabin", "Ticket"])
train_features = train_df.drop(columns=["Survived"])
train_target = train_df["Survived"]

# Columns declared as categorical to featuretools.
categorical_columns = ["Pclass", "Sex", "Embarked", "CabinClass",
                       "LastName", "Honorific", "TicketPrefix"]
# (new entity id, index column) pairs, normalised in this order.
# The "honorfics" spelling is kept: entity ids are referenced by saved features.
normalized_entities = [
    ("classes", "Pclass"),
    ("sexes", "Sex"),
    ("embarkeds", "Embarked"),
    ("cabinclasses", "CabinClass"),
    ("honorfics", "Honorific"),
    ("lastnames", "LastName"),
    ("ticketprefixes", "TicketPrefix"),
]

es = ft.EntitySet(id="titanic")
es.entity_from_dataframe(entity_id="titanic",
                         dataframe=train_features,
                         index="PassengerId",
                         variable_types={column: ft.variable_types.Categorical
                                         for column in categorical_columns})
for new_entity_id, index_column in normalized_entities:
    es.normalize_entity(base_entity_id="titanic",
                        new_entity_id=new_entity_id,
                        index=index_column)

feature_matrix, features = ft.dfs(entityset=es,
                                  target_entity="titanic",
                                  save_progress="results/",
                                  verbose=True)

feature_matrix, features = ft.encode_features(feature_matrix, features)
# Persist the encoded feature definitions: the test-set cell reloads them from
# this path so that it computes exactly the same columns.
ft.save_features(features, "results/encoded_features.pkl")

# The fitted imputer is reused later to transform the test matrix.
imputer = Imputer(strategy='median')
imputed_matrix = imputer.fit_transform(feature_matrix)

  """
  
  import sys


Built 313 features
Elapsed: 00:18 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 1/1 cutoff times


In [21]:
# Reload each tuned estimator, rebuild its preprocessing + learner pipeline and
# wrap it in a 5-fold isotonic probability calibrator fitted on the training data.
calibrated_estimators = []
for name, pickle_file in pickle_files:
    # NOTE: pickle.load can execute arbitrary code -- only load model files
    # produced by this project.
    with open(f"results/models/{pickle_file}", "rb") as model_file:
        hyperopt_model = pickle.load(model_file)

    # The file is only needed for loading; fitting happens after it is closed.
    best = hyperopt_model.best_model()
    model = best['learner']
    prepro = best['preprocs']
    pipeline = make_pipeline(*prepro, model)

    calibrator = CalibratedClassifierCV(pipeline, cv=5, method="isotonic")
    calibrator.fit(imputed_matrix, train_target)
    calibrated_estimators.append((name, calibrator))

In [23]:
# Keep only the fitted calibrators; the model names are not needed for stacking.
classifiers = [fitted_calibrator for _, fitted_calibrator in calibrated_estimators]

In [27]:
# Second-level learner, trained on the base classifiers' predicted
# probabilities (use_probas=True) kept per classifier rather than averaged
# (average_probas=False).
meta_learner = XGBClassifier()
stack = StackingClassifier(
    classifiers=classifiers,
    meta_classifier=meta_learner,
    use_probas=True,
    average_probas=False,
)

In [30]:
# Cross-validated score of the stacked model on the training data, using the
# library's default fold count and the estimator's default scorer.
cross_val_score(estimator=stack, X=imputed_matrix, y=train_target)

  if diff:
  if diff:
  if diff:


array([0.78451178, 0.86531987, 0.84848485])

In [31]:
# Fit the stacked model on the full training set before predicting on test data.
stack.fit(X=imputed_matrix, y=train_target)

StackingClassifier(average_probas=False,
          classifiers=[CalibratedClassifierCV(base_estimator=Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(-1.0, 1.0))), ('gradientboostingclassifier', GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005168273904849856, ...ndom_state=2,
            verbose=False, warm_start=False))]),
            cv=5, method='isotonic')],
          meta_classifier=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          refit=True, store_train_meta_features=False,
          use_features_in_secondary=False, use_probas=True, verbose=0)

In [34]:
# Build the test feature matrix with the same derived columns and entity set
# layout as the training cell, then evaluate the saved encoded features on it.
test_df = pd.read_csv("data/test.csv")
test_df["Pclass"] = test_df["Pclass"].astype("category")
# First character of the cabin string (NaN stays NaN).
test_df['CabinClass'] = test_df["Cabin"].str.get(0)
# Everything before the first ", " in the name.
test_df['LastName'] = test_df['Name'].str.split(", ").str.get(0)
# First run of letters that follows a space. The class is [a-zA-Z]; the former
# [a-zA-z] range also matched "[", "\", "]", "^", "_" and "`".
# expand=False makes str.extract return a Series on every pandas version.
test_df['Honorific'] = test_df['Name'].str.extract(" ([a-zA-Z]+)", expand=False)
# Greedy match of everything before the last space; NaN when the ticket has no space.
test_df['TicketPrefix'] = test_df['Ticket'].str.extract("(.+) ", expand=False)
# Trailing digits of the ticket; float so tickets without digits can be NaN.
test_df['TicketNumber'] = test_df['Ticket'].str.extract("([0-9]+)$", expand=False).astype(float)
test_df = test_df.drop(columns=["Name", "Cabin", "Ticket"])

# Declare every categorical column explicitly, matching the training entity
# set, so test variable types do not depend on featuretools' type inference.
categorical_columns = ["Pclass", "Sex", "Embarked", "CabinClass",
                       "LastName", "Honorific", "TicketPrefix"]
# (new entity id, index column) pairs, normalised in this order.
# The "honorfics" spelling is kept: entity ids are referenced by saved features.
normalized_entities = [
    ("classes", "Pclass"),
    ("sexes", "Sex"),
    ("embarkeds", "Embarked"),
    ("cabinclasses", "CabinClass"),
    ("honorfics", "Honorific"),
    ("lastnames", "LastName"),
    ("ticketprefixes", "TicketPrefix"),
]

es = ft.EntitySet(id="titanic")
es.entity_from_dataframe(entity_id="titanic",
                         dataframe=test_df,
                         index="PassengerId",
                         variable_types={column: ft.variable_types.Categorical
                                         for column in categorical_columns})
for new_entity_id, index_column in normalized_entities:
    es.normalize_entity(base_entity_id="titanic",
                        new_entity_id=new_entity_id,
                        index=index_column)

# Reuse the feature definitions saved from training so the columns line up
# with what the imputer and the models were fitted on.
test_features = ft.load_features("results/encoded_features.pkl", es)
test_matrix = ft.calculate_feature_matrix(test_features)
# Transform only: the imputer keeps the medians learned on the training matrix.
test_matrix = imputer.transform(test_matrix)

  """
  
  import sys


In [36]:
# Predict on the test matrix and write the submission file.
# NOTE(review): predictions are paired with PassengerId by position; this
# assumes test_matrix rows are in the same order as test_df -- confirm.
test_predictions = stack.predict(test_matrix)
submission_df = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': test_predictions,
})
submission_df.to_csv("results/ft_hpsklearn_submission_stacked.csv", index=False)

  if diff:
