# TPOT Test with Titanic Dataset (Kaggle)
Data source: the Kaggle Titanic competition — https://www.kaggle.com/c/titanic/data

In [1]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
from multiprocessing import cpu_count
import pandas as pd
import numpy as np

In [2]:
def read_dataset(file):
    """Load a Titanic CSV and turn it into a numeric feature frame.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pd.read_csv`` (comma-separated).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Pclass``, ``Age``, ``SibSp``, ``Parch``,
        ``PassengerId``, ``target`` (only when the file has a ``Survived``
        column), ``male``, ``C``, ``S``, ``adult``.
        Rows with a missing value in any selected column are dropped.

    Notes
    -----
    The indicator columns are built explicitly rather than with
    ``pd.get_dummies``: ``get_dummies`` only emits columns for categories that
    actually occur in the file, so a file lacking e.g. any ``'Q'`` embarkation
    or any female passenger would either raise ``KeyError`` on the follow-up
    drop or yield a different feature set than the training file.
    ``female`` and ``Q`` are the implicit baselines (all indicators zero).
    """
    base_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'PassengerId']

    raw = pd.read_csv(file, sep=',')
    cols = base_cols + (['Survived'] if 'Survived' in raw.columns else [])

    # Work on an independent copy so the column assignments below never write
    # into a view of `raw` (avoids SettingWithCopyWarning).
    df = raw[cols].dropna().copy()

    # Fixed, file-independent indicator columns.
    df['male'] = (df['Sex'] == 'male').astype('uint8')
    df['C'] = (df['Embarked'] == 'C').astype('uint8')
    df['S'] = (df['Embarked'] == 'S').astype('uint8')

    df = df.drop(columns=['Sex', 'Embarked']).rename(columns={'Survived': 'target'})

    # 1 for passengers aged 21 or older, 0 otherwise (Age has no NaN here).
    df['adult'] = (df['Age'] >= 21).astype('int64')
    return df

In [3]:
# PassengerId is a row identifier, not a feature, so it is removed before training.
df = read_dataset('train.csv').drop(columns=['PassengerId'])

# Shuffle the rows, then renumber the index 0..n-1 so labels match positions.
shuffled_order = np.random.permutation(len(df))
df = df.iloc[shuffled_order].reset_index(drop=True)
df.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,target,male,C,S,adult
0,3,32.0,0,0,0,1,0,0,1
1,3,30.0,1,1,0,0,0,1,1
2,3,21.0,0,0,1,0,0,1,1
3,2,4.0,1,1,1,0,0,1,0
4,3,28.0,0,0,0,1,0,1,1


In [4]:
target = df['target'].values

# Split the row labels (not the data itself) 75/25, stratified on the target
# so both parts keep the same class balance. The result is unpacked exactly
# once; later cells select rows with df.loc[...] using these two index sets.
training_indices, validation_indices = train_test_split(
    df.index, stratify=target, test_size=0.25
)

training_indices.size, validation_indices.size

(534, 178)

In [5]:
# Search settings: capped at 20 minutes, one job per CPU core.
tpot = TPOTClassifier(
    verbosity=2,
    max_time_mins=20,
    n_jobs=cpu_count(),
    early_stop=10,
)

# Every column except the label is a feature.
feature_frame = df.drop(columns='target')

train_features = feature_frame.loc[training_indices].values
train_target = df.loc[training_indices, 'target'].values
tpot.fit(train_features, train_target)

# Score on the held-out rows that were not used for fitting.
validation_features = feature_frame.loc[validation_indices].values
validation_target = df.loc[validation_indices, 'target'].values
score = tpot.score(validation_features, validation_target)

print(score)

HBox(children=(IntProgress(value=0, description='Optimization Progress', style=ProgressStyle(description_width…

Generation 1 - Current best internal CV score: 0.8240286839475696
Generation 2 - Current best internal CV score: 0.8240286839475696
Generation 3 - Current best internal CV score: 0.8240286839475696
Generation 4 - Current best internal CV score: 0.8240286839475696
Generation 5 - Current best internal CV score: 0.8240286839475696
Generation 6 - Current best internal CV score: 0.8240286839475696
Generation 7 - Current best internal CV score: 0.8276618533539711
Generation 8 - Current best internal CV score: 0.8295486458068012
Generation 9 - Current best internal CV score: 0.8295486458068012
Generation 10 - Current best internal CV score: 0.8295486458068012
Generation 11 - Current best internal CV score: 0.8295486458068012
Generation 12 - Current best internal CV score: 0.8295486458068012
Generation 13 - Current best internal CV score: 0.831453071833957
Generation 14 - Current best internal CV score: 0.831453071833957
Generation 15 - Current best internal CV score: 0.831453071833957
Generat

In [6]:
# Write the pipeline found by the search above to a standalone Python script;
# the following section reads pipeline.py back in and runs it.
tpot.export('pipeline.py')

True

---
## Execute the TPOT-suggested pipeline

In [7]:
# Read the exported script as plain text.
with open('pipeline.py', 'r') as pipeline_file:
    exported_source = pipeline_file.read()

# Comment out the script's own `tpot_data = ...` assignment so that it will use
# a `tpot_data` variable supplied by this notebook instead.
pipeline_contents = exported_source.replace('tpot_data = ', '# tpot_data = ')

print(pipeline_contents)

import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import OneHotEncoder, StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:0.831453071833957
exported_pipeline = make_pipeline(
    OneHotEncoder(minimum_fraction=0.25, sparse=False, threshold=10),
    StackingEstimator(estimator=XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=6, n_estimat

In [8]:
# The exported script reads its input frame from the name `tpot_data` (its own
# assignment to that name was commented out in the previous cell), so bind the
# prepared training frame to it first.
tpot_data = df
# Run the exported script in this notebook's namespace; this is what defines
# `exported_pipeline`, which the prediction cell below relies on.
# NOTE(review): exec is tolerable here only because pipeline.py was generated
# locally by TPOT in this session — never exec a script from an untrusted source.
exec(pipeline_contents)

In [9]:
# Load the Kaggle test file with the same preprocessing as the training data
# and cast every column to float in one step.
predict = read_dataset('test.csv').astype(float)
predict.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,PassengerId,male,C,S,adult
0,3.0,34.5,0.0,0.0,892.0,1.0,0.0,0.0,1.0
1,3.0,47.0,1.0,0.0,893.0,0.0,0.0,1.0,1.0
2,2.0,62.0,0.0,0.0,894.0,1.0,0.0,0.0,1.0
3,3.0,27.0,0.0,0.0,895.0,1.0,0.0,1.0,1.0
4,3.0,22.0,1.0,1.0,896.0,0.0,0.0,1.0,1.0


In [10]:
# Feed the model only the feature columns. 'Survived' is listed as well (with
# errors='ignore') because this cell adds that column to `predict`: without it,
# re-running the cell would pass the previous predictions in as an extra
# feature and change the input width the pipeline was fitted on.
feature_frame = predict.drop(columns=['PassengerId', 'Survived'], errors='ignore')
result = exported_pipeline.predict(feature_frame)
predict['Survived'] = result
predict.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,PassengerId,male,C,S,adult,Survived
0,3.0,34.5,0.0,0.0,892.0,1.0,0.0,0.0,1.0,0
1,3.0,47.0,1.0,0.0,893.0,0.0,0.0,1.0,1.0,0
2,2.0,62.0,0.0,0.0,894.0,1.0,0.0,0.0,1.0,0
3,3.0,27.0,0.0,0.0,895.0,1.0,0.0,1.0,1.0,0
4,3.0,22.0,1.0,1.0,896.0,0.0,0.0,1.0,1.0,0


In [11]:
# Number of passengers predicted in each class (0 = did not survive, 1 = survived).
predict.groupby('Survived')[['PassengerId']].count()

Unnamed: 0_level_0,PassengerId
Survived,Unnamed: 1_level_1
0,206
1,126
