In [7]:
%load_ext autoreload
%autoreload 2

#load packages
import sys
import pandas as pd
import matplotlib
import numpy as np
import scipy as sp
import IPython
from IPython import display
import sklearn
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn_pandas import DataFrameMapper, cross_val_score

import titanic_functions as titfun
import transformer_classes

In [8]:
CAT_ATTRIBS = ['Sex','Embarked','Title']
NUMERICS_ATTRIBS = ['Pclass','Age','SibSp','Parch','Fare']

In [9]:
#Read train and test data
train = pd.read_csv("../input/train.csv", dtype={"Age": np.float64}, )
test = pd.read_csv("../input/test.csv", dtype={"Age": np.float64}, )
# combine into two
combine = [train, test]

In [10]:
my_mapper = DataFrameMapper([
    ('Sex', sklearn.preprocessing.LabelBinarizer()),
    ('Embarked', sklearn.preprocessing.LabelBinarizer()),
    ('Title', sklearn.preprocessing.LabelBinarizer())
    ], input_df=True)

In [11]:
categorical_data_pipeline = Pipeline([
    ('ebarked_imputer', transformer_classes.EmbarkedImputer()),
    ('title_creator', transformer_classes.TitleCreator()),
    ('label_binarizer_df', my_mapper),
])



numerical_data_pipeline = Pipeline([
    ('fare_imputer', transformer_classes.GeneralImputer(col_impute=['Fare'], 
                                                        col_group=['Sex', 'Pclass'], 
                                                        impute_method='median')),
    ('age_imputer', transformer_classes.GeneralImputer(col_impute=['Age'], 
                                                       col_group=['Sex', 'Pclass'], 
                                                       impute_method='average')), # median perhaps?
    ('selector', transformer_classes.DataFrameSelector(NUMERICS_ATTRIBS)),
    ('std_scaler', StandardScaler()),
    
])



full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", numerical_data_pipeline),
    ("cat_pipeline", categorical_data_pipeline),
])    

    

train_prepared = full_pipeline.fit_transform(train)
test_prepared = full_pipeline.transform(test)

print(type(train_prepared))
print(train_prepared.shape)
print(train_prepared)

<class 'numpy.ndarray'>
(891, 14)
[[ 0.82737724 -0.55136635  0.43279337 ...,  1.          0.          0.        ]
 [-1.56610693  0.65402951  0.43279337 ...,  0.          1.          0.        ]
 [ 0.82737724 -0.25001739 -0.4745452  ...,  0.          0.          0.        ]
 ..., 
 [ 0.82737724 -0.57020066  0.43279337 ...,  0.          0.          0.        ]
 [-1.56610693 -0.25001739 -0.4745452  ...,  1.          0.          0.        ]
 [ 0.82737724  0.20200606 -0.4745452  ...,  1.          0.          0.        ]]


In [81]:
from tpot import TPOTClassifier
tpot = TPOTClassifier(verbosity=2, max_time_mins=20)
tpot.fit(train_prepared, train['Survived'])

                                                                                                                       

Generation 1 - Current best internal CV score: 0.8417719849052429


                                                                                                                       

Generation 2 - Current best internal CV score: 0.8417719849052429


                                                                                                                       

Generation 3 - Current best internal CV score: 0.8417719849052429


                                                                                                                       

Generation 4 - Current best internal CV score: 0.8429208305450022


                                                                                                                       

Generation 5 - Current best internal CV score: 0.8429208305450022


                                                                                                                       

Generation 6 - Current best internal CV score: 0.8440317300562065


                                                                                                                       

Generation 7 - Current best internal CV score: 0.8440317300562065


                                                                                                                       

Generation 8 - Current best internal CV score: 0.848570335277097


                                                                                                                       




                                                                                                                       

20.00323805 minutes have elapsed. TPOT will close down.
TPOT closed prematurely. Will use the current best pipeline.


                                                                                                                       


Best pipeline: LinearSVC(RandomForestClassifier(RobustScaler(input_matrix), bootstrap=False, criterion=entropy, max_features=0.8, min_samples_leaf=12, min_samples_split=16, n_estimators=100), C=15.0, dual=True, loss=squared_hinge, penalty=l2, tol=0.0001)


TPOTClassifier(config_dict={'sklearn.preprocessing.Normalizer': {'norm': ['l1', 'l2', 'max']}, 'xgboost.XGBClassifier': {'n_estimators': [100], 'min_child_weight': range(1, 21), 'max_depth': range(1, 11), 'nthread': [1], 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'subsample': array([ 0.05,  0.1 ,  0.15,  0.2 ,...l': [1e-05, 0.0001, 0.001, 0.01, 0.1], 'penalty': ['l1', 'l2'], 'loss': ['hinge', 'squared_hinge']}},
        crossover_rate=0.1, cv=5, disable_update_check=False,
        early_stop=None, generations=1000000, max_eval_time_mins=5,
        max_time_mins=20, memory=None, mutation_rate=0.9, n_jobs=1,
        offspring_size=100, periodic_checkpoint_folder=None,
        population_size=100, random_state=None, scoring=None,
        subsample=1.0, verbosity=2, warm_start=False)

In [82]:
tpot.export('tpot_titanic_pipeline_4.py')

True

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier

# param_grid = [
#     {'bootstrap': [False, True], 
#      'n_estimators': [80, 100, 130], 
#      'max_features': [0.65, 0.7500000000000001],
#      'min_samples_leaf': [10,12], 
#      'min_samples_split': [5] 
#     },
# ]

# random_forest_classifier = RandomForestClassifier()

# grid_search = GridSearchCV(random_forest_classifier, param_grid, cv=5,scoring='neg_mean_squared_error', refit=True)
# grid_search.fit(train_prepared, train['Survived'])

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import RobustScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

exported_pipeline = make_pipeline(
    RobustScaler(),
    StackingEstimator(estimator=RandomForestClassifier(bootstrap=False, criterion="entropy", max_features=0.8, min_samples_leaf=12, min_samples_split=16, n_estimators=100)),
    LinearSVC(C=15.0, dual=True, loss="squared_hinge", penalty="l2", tol=0.0001)
)
# RobustScaler / StackingEstimator / LinearSVC

In [20]:
exported_pipeline.fit(train_prepared, train['Survived'])
final_predictions = exported_pipeline.predict(test_prepared)

In [58]:
# grid_search.best_params_
# final_model = grid_search.best_estimator_

{'bootstrap': True,
 'max_features': 0.7500000000000001,
 'min_samples_leaf': 12,
 'min_samples_split': 5,
 'n_estimators': 100}

In [70]:
# final_predictions = final_model.predict(test_prepared)
# final_predictions

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0,

In [21]:
my_submission = pd.DataFrame({'PassengerId': test.PassengerId, 'Survived': final_predictions})
my_submission.to_csv('20180220b_submission.csv', index=False)