In [2]:
try:
    import tpot 
    print('Package already installed')
except ImportError:
    print('Installing package...')
    !conda install -c conda-forge tpot -y
    print('Package installed!')

Installing package...
Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/mcampos/opt/anaconda3

  added / updated specs:
    - tpot


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    deap-1.3.3                 |   py39h0082581_0         153 KB  conda-forge
    libcxx-14.0.6              |       hce7ea42_0         1.3 MB  conda-forge
    libxgboost-1.5.0           |       he9d5cce_2         1.2 MB
    py-xgboost-1.5.0           |   py39hecd8cb5_2         154 KB
    stopit-1.1.2               |             py_0          16 KB  conda-forge
    tpot-0.11.7                |     pyhd8ed1ab_1          56 KB  conda-forge
    update_checker-0.18.0      |     pyh9f0ad1d_0          10 KB  conda-forge
    -----------------------------------

In [89]:
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np

In [90]:
# Load the data
titanic = pd.read_csv('data/train.csv')
titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [91]:
print('---- [Variable cardinality] ------\n')
for cat in ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']:
    print("   '{0}'\t: {1} ".format(cat, titanic[cat].unique().size))
    
    

---- [Variable cardinality] ------

   'Name'	: 891 
   'Sex'	: 2 
   'Ticket'	: 681 
   'Cabin'	: 148 
   'Embarked'	: 4 


In [92]:
# Simple cleaning
titanic.rename(columns={'Survived': 'class'}, inplace=True)
titanic['Sex'] = titanic['Sex'].map({'male':0,'female':1})
titanic['Embarked'] = titanic['Embarked'].map({'S':0,'C':1,'Q':2})

In [94]:
titanic = titanic.fillna(-999)
pd.isnull(titanic).any()

PassengerId    False
class          False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
dtype: bool

In [96]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
values_fld = [{str(val)} for val in titanic['Cabin'].values]
CabinTrans = mlb.fit_transform(values_fld)

In [98]:
CabinTrans

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [101]:
titanic_new = titanic.drop(['Name','Ticket','Cabin','class'], axis=1)

In [102]:
titanic_new = np.hstack((titanic_new.values, CabinTrans))

In [103]:
titanic_class = titanic['class'].values

In [104]:
training_indices, validation_indices = training_indices, testing_indices = train_test_split(titanic.index, stratify = titanic_class, train_size=0.75, test_size=0.25)
training_indices.size, validation_indices.size



(668, 223)

In [105]:
# %%capture --no-stderr
# Using the command above to disable warnings.
tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=40)
tpot.fit(titanic_new[training_indices], titanic_class[training_indices])

Optimization Progress:   0%|          | 0/40 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8009202109751993

Generation 2 - Current best internal CV score: 0.8098529906856694

Generation 3 - Current best internal CV score: 0.8098529906856694

Generation 4 - Current best internal CV score: 0.8248344742453149

Generation 5 - Current best internal CV score: 0.8248344742453149

2.01 minutes have elapsed. TPOT will close down.
TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.

Best pipeline: XGBClassifier(SelectFwe(input_matrix, alpha=0.025), learning_rate=0.5, max_depth=5, min_child_weight=11, n_estimators=100, n_jobs=1, subsample=0.6500000000000001, verbosity=0)


TPOTClassifier(max_eval_time_mins=0.04, max_time_mins=2, population_size=40,
               verbosity=2)

In [107]:
tpot.score(titanic_new[validation_indices], titanic.loc[validation_indices, 'class'].values)

0.8295964125560538

In [108]:
tpot.export('generated_titanic_model.py')

#### Generated python file 

##### Execute new prediction using the code above.

In [110]:
# Read in the submission dataset
titanic_test = pd.read_csv('data/test.csv')
titanic_test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Survived,Unnamed: 12
count,418.0,418.0,332.0,418.0,418.0,417.0,418.0,0.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188,0.421053,
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576,1.017703,
min,892.0,1.0,0.17,0.0,0.0,0.0,0.0,
25%,996.25,1.0,21.0,0.0,0.0,7.8958,0.0,
50%,1100.5,3.0,27.0,0.0,0.0,14.4542,0.0,
75%,1204.75,3.0,39.0,1.0,0.0,31.5,1.0,
max,1309.0,3.0,76.0,8.0,9.0,512.3292,11.0,


In [112]:
for var in ['Cabin']: #,'Name','Ticket']:
    new = list(set(titanic_test[var]) - set(titanic[var]))
    titanic_test.loc[titanic_test[var].isin(new), var] = -999

In [113]:
titanic_test['Sex'] = titanic_test['Sex'].map({'male':0,'female':1})
titanic_test['Embarked'] = titanic_test['Embarked'].map({'S':0,'C':1,'Q':2})
titanic_test = titanic_test.fillna(-999)
pd.isnull(titanic_test).any()

PassengerId    False
Pclass         False
Name           False
Sex            False
Age            False
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin          False
Embarked       False
Survived       False
Unnamed: 12    False
dtype: bool

In [114]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
SubCabinTrans = mlb.fit(
    [{str(val)} for val in titanic['Cabin'].values]).transform(
        [{str(val)} for val in titanic_test['Cabin'].values])
titanic_test = titanic_test.drop(['Name','Ticket','Cabin'], axis=1)

In [115]:
# Form the new submission data set
titanic_sub_new = np.hstack((titanic_test.values,SubCabinTrans))
np.any(np.isnan(titanic_sub_new))

False

In [116]:
# Ensure equal number of features in both the final training and submission dataset
assert (titanic_new.shape[1] == titanic_sub_new.shape[1]), "Not Equal" 

AssertionError: Not Equal

In [117]:
titanic_new.shape[1]

156

In [118]:
titanic_new[1]

array([ 2.    ,  1.    ,  1.    , 38.    ,  1.    ,  0.    , 71.2833,
        1.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  1.    ,
        0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,
        0.    ,  0. 

In [119]:
titanic_sub_new.shape[1]

158

In [69]:
titanic_sub_new[1]

array([ 1.237e+03,  3.000e+00,  1.000e+00,  1.600e+01,  0.000e+00,
        0.000e+00,  7.650e+00,  0.000e+00,  1.000e+00, -9.990e+02,
        1.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,
        0.000e+00,  0.000e+00,  0.000e+00,  0.000e+00,  0.000e

In [62]:
# Generate the predictions
submission = tpot.predict(titanic_sub_new)

ValueError: X has 158 features, but ExtraTreesClassifier is expecting 156 features as input.

In [63]:
# Create the submission file
final = pd.DataFrame({'PassengerId': titanic_sub['PassengerId'], 'Survived': submission})
final.to_csv('data/submission.csv', index = False)

NameError: name 'submission' is not defined

In [64]:
final.shape

NameError: name 'final' is not defined