# Import libraries

In [1]:
import numpy as np 
import pandas as pd 

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

import re
import operator
import math
import random
random.seed(42)

import warnings

from deap import gp
from deap import algorithms
from deap import base
from deap import creator
from deap import tools

## Ignore pandas warnings

In [2]:
warnings.filterwarnings("ignore")

# Creating a Genetic Programming program

## Creating the primitives set

One of the most crucial aspect of a GP program is the choice of the primitives set. They should make good building blocks for the individuals so the evolution can succeed. In this problem, we use which are basic arithmetic functions and define dividing, power of number, sqrt, and absolute of a number to prevent some errors.

The number following the function is the arity of the primitive, that is the number of entries it takes.

In [3]:
def prim_set(names):
    # Define new functions
    def pDiv(left, right):
        try:
            return left / right
        except ZeroDivisionError:
            return 1

    def pPow(left, right):
        try:
            return abs(left) ** min(float(right),8)
        except ZeroDivisionError:
            return 1
        except OverflowError:
            return 1
    
    def pSqrt(inp):
        return math.sqrt(abs(inp))
    
    def abs_(inp):
        return abs(inp)
    
    pset = gp.PrimitiveSet("MAIN", len(names))
    pset.addPrimitive(pDiv, 2)
    pset.addPrimitive(pPow, 2)
    pset.addPrimitive(pSqrt, 1)
    pset.addPrimitive(abs_, 1)
    pset.addPrimitive(operator.add, 2)
    pset.addPrimitive(operator.sub, 2)
    pset.addPrimitive(operator.mul, 2)
    pset.addPrimitive(operator.neg, 1)
    pset.addPrimitive(math.floor, 1)
    pset.addPrimitive(math.tanh, 1)
    pset.addPrimitive(math.sin, 1)
    pset.addPrimitive(math.cos, 1)
    pset.addPrimitive(max, 2)
    pset.addPrimitive(min, 2)
    
    # Rename arguments with columns names
    for i, a in enumerate(pset.arguments):
        new_name = names[i]
        pset.arguments[i] = new_name
        pset.mapping[new_name] = pset.mapping[a]
        pset.mapping[new_name].value = new_name
        del pset.mapping[a]
    return pset

Here, as we have a multy dimension regression problem, there are a lot of inputs, but it could have as many as you want. By default, those inputs are named “ARGx”, where “x” is a number, but we renamed them.

## Genetic Algorithm

Prepare inputs and outputs, initializing Deap global variables to use eaSimple(simplest evolutionary algorithm):
- Create an individual containing the genotype and a fitness
- Create toolbox to register some parameters specific to the evolution process

First, a toolbox instance is created (in some problem types like coevolution, you may consider creating more than one toolbox). Then, we can register any parameters. The first lines register how to create an individual (by calling gp.genHalfAndHalf with the previously defined primitive set), and how to create the population (by repeating the individual initialization).

- We may now introduce the evaluation function

Which will receive an individual as input, and return the corresponding fitness. This function uses the compile function previously defined to transform the individual into its executable form – that is, a program. After that, the evaluation is only simple maths, where the difference between the values produced by the evaluated individual and the real values are squared and summed to compute the MSE (Mean Squared Error), which is returned as the fitness of the individual.

- Afterwards, we register the evaluation function. We also choose the selection method (a tournament of size 5), the mate method (one point crossover with uniform probability over all the nodes), and the mutation method (a uniform probability mutation which may append a new full sub-tree to a node).

- Then, we decorate the mate and mutate method to limit the height of generated individuals. 

This is done to avoid an important draw back of genetic programming : bloat. Koza in his book on genetic programming suggest to use a max depth of 17.

- Сreating the population

The hall of fame is a specific structure which contains the n best individuals.

- Statistics are often useful in evolutionary programming

DEAP offers a simple class which can handle most of the “boring work”. In this case, we want to compute the mean and maximum of both the individuals fitness and size. For that we’ll use a MultiStatistics object.

- Calling a complete algorithm. In this case, we’ll use eaSimple()


In [4]:
def gen_alg(mungedtrain, target, seed=42, mxvl=7, ngen=125, pop=200):

    inputs = mungedtrain.values.tolist()
    outputs = target.values.tolist()

    pset = prim_set(list(mungedtrain.columns))
    
    creator.create("FitnessMin", base.Fitness, weights=(1.0,)) # Create fitness object
    creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin) # Create individual object
    
    toolbox = base.Toolbox()
    toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3)
    toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("compile", gp.compile, pset=pset)
    
    def evalSymbReg(individual):
        # Transform the tree expression in a callable function
        func = toolbox.compile(expr=individual)
        # Evaluate the accuracy
        return sum(round(1.-(1./(1.+np.exp(-func(*in_))))) == out for in_,
                   out in zip(inputs, outputs))/len(mungedtrain),
    
    toolbox.register("evaluate", evalSymbReg)
    toolbox.register("select", tools.selTournament, tournsize=5)
    toolbox.register("mate", gp.cxOnePoint)
    toolbox.register("expr_mut", gp.genFull, min_=0, max_=3)
    toolbox.register("mutate", gp.mutUniform, expr=toolbox. expr_mut, pset=pset)
    
    toolbox.decorate("mate", gp.staticLimit(key=operator.attrgetter("height"), max_value=mxvl))
    toolbox.decorate("mutate", gp.staticLimit(key=operator.attrgetter("height"), max_value=mxvl))
   
    random.seed(seed)

    pop = toolbox.population(n=pop)
    hof = tools.HallOfFame(1)
    
    stats_fit = tools.Statistics(lambda ind: ind.fitness.values)
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", np.mean)
    mstats.register("max", np.max)
    
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.65, mutpb=0.35, ngen=ngen,
                                   stats=mstats, halloffame=hof, verbose=True)

    print(hof[0])
    print(hof[0].fitness.values)
    return hof[0], toolbox

## Defining function for outputs

In [5]:
def Outputs(data):
    return np.round(1.-(1./(1.+np.exp(-data))))

# Data preparation

Define function for geting title from passenger name and function for data preparation

In [6]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
            return title_search.group(1)
    return ""

def prep_data(data):
    data['IsNull'] = data.isnull().sum(axis=1)
    data['Ticket'] = data['Ticket'].str.lower().replace('\W', '')
    # Sex
    data.Sex.fillna(0, inplace=True)
    data.loc[data.Sex != 'male', 'Sex'] = 1
    data.loc[data.Sex == 'male', 'Sex'] = 0
    data['NameLen'] = data['Name'].apply(len)
    bin_num = 4
    data['NameLen'] = pd.qcut(data['NameLen'], bin_num,labels=list(range(bin_num))).astype(float)   
    # Feature that tells whether a passenger had a cabin on the Titanic
    data['Has_Cabin'] = data["Cabin"].apply(lambda x: 0 if type(x) == float else 1)
    # Create new feature FamilySize as a combination of SibSp and Parch
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    # Create new feature IsAlone from FamilySize
    data['isFamily'] = 1
    data.loc[data['isFamily'] == 1, 'notAlone'] = 0
    # Create a new feature Title, containing the titles of passenger names
    data['Title'] = data['Name'].apply(get_title)
    # Group all non-common titles into one single grouping "Rare"
    mapping = {'Mlle': 'Rare', 'Major': 'Mr', 'Col': 'Mr', 'Sir': 'Rare', 'Rev': 'Mr',
               'Don': 'Mr', 'Mme': 'Rare', 'Jonkheer': 'Mr', 'Lady': 'Mrs',
               'Capt': 'Mr', 'Countess': 'Rare', 'Ms': 'Miss', 'Dona': 'Rare'}
    data.replace({'Title': mapping}, inplace=True)
    # Mapping titles
    title_mapping = {"Mr": 0, "Miss": 1, "Mrs": 2, "Master": 3, "Rare": 4}
    data['Title'] = data['Title'].map(title_mapping)
    data['Title'] = data['Title'].fillna(0)
    # Remove all NULLS in the Embarked column
    data['Embarked'].fillna(method='backfill', inplace=True)
    # Mapping Embarked
    data['Embarked'] = data['Embarked'].map( {'C': 1, 'Q': 2, 'S': 0} ).astype(int)
    # Remove all NULLS in the Fare column and create a new feature
    data['Fare'] = data['Fare'].fillna(train['Fare'].median())
    # Mapping Fare
    data.loc[ data['Fare'] <= 7.91, 'Fare'] = 0
    data.loc[(data['Fare'] > 7.91) & (data['Fare'] <= 14.454), 'Fare'] = 1
    data.loc[(data['Fare'] > 14.454) & (data['Fare'] <= 31), 'Fare'] = 2
    data.loc[ data['Fare'] > 31, 'Fare'] = 3
    data['Fare'] = data['Fare'].astype(int)
    # Cabin
    data.Cabin.fillna('0', inplace=True)
    data.loc[data.Cabin.str[0] == 'A', 'Cabin'] = 1
    data.loc[data.Cabin.str[0] == 'B', 'Cabin'] = 2
    data.loc[data.Cabin.str[0] == 'C', 'Cabin'] = 3
    data.loc[data.Cabin.str[0] == 'D', 'Cabin'] = 4
    data.loc[data.Cabin.str[0] == 'E', 'Cabin'] = 5
    data.loc[data.Cabin.str[0] == 'F', 'Cabin'] = 6
    data.loc[data.Cabin.str[0] == 'G', 'Cabin'] = 7
    data.loc[data.Cabin.str[0] == 'T', 'Cabin'] = 8
    data['Cabin'] = data['Cabin'].astype(int)
    # Fillna Age
    grouped = data.groupby(['Sex','Pclass', 'Title'])
    data['Age'] = grouped['Age'].apply(lambda x: x.fillna(x.median()))
    data['Age'] = data['Age'].astype(int)
    # select females and masters (boys)
    boy = (data['Name'].str.contains('Master')) | ((data['Sex']==0) & (data['Age']<13))
    female = data['Sex']==1
    boy_or_female = boy | female   
    # no. females + boys on ticket
    n_ticket = data[boy_or_female].groupby('Ticket').Survived.count()
    # survival rate amongst females + boys on ticket
    tick_surv = data[boy_or_female].groupby('Ticket').Survived.mean()
    data['Boy'] = (data['Name'].str.contains('Master')) | ((data['Sex']==0) & (data['Age']<13))   
    # if ticket exists in training data, fill NTicket with no. women+boys
    # on that ticket in the training data.
    data['NTicket'] = data['Ticket'].replace(n_ticket)
    # otherwise NTicket=0
    data.loc[~data.Ticket.isin(n_ticket.index),'NTicket']=0
    # if ticket exists in training data, fill TicketSurv with
    # women+boys survival rate in training data  
    data['TicketSurv'] = data['Ticket'].replace(tick_surv)
    # otherwise TicketSurv=0
    data.loc[~data.Ticket.isin(tick_surv.index),'TicketSurv']=0
    data['TicketSurv'].fillna(0, inplace=True)
    # Mapping Age
    data.loc[ data['Age'] <= 16, 'Age'] = 5
    data.loc[(data['Age'] > 16) & (data['Age'] <= 32), 'Age'] = 1
    data.loc[(data['Age'] > 32) & (data['Age'] <= 48), 'Age'] = 2
    data.loc[(data['Age'] > 48) & (data['Age'] <= 64), 'Age'] = 3
    data.loc[ data['Age'] > 64, 'Age'] = 4
    data['manual_tree'] = 0
    data.loc[boy_or_female, 'manual_tree'] = 1
    data.loc[(data['Sex'] == 1) & 
             (data['Pclass'] == 3) & 
             (data['Embarked'] == 0)  &
             (data['Fare'] > 0), 'manual_tree'] = 0
    data.loc[(data['Sex'] == 0) &
             (data['Title'] == 3), 'manual_tree'] = 1
    tfidf_vec = TfidfVectorizer(max_features=15, token_pattern="\w+")
    svd = TruncatedSVD(n_components=10)
    tfidf_array = svd.fit_transform(tfidf_vec.fit_transform(data["Name"]))
    for i in range(tfidf_array.shape[1]):
        data.insert(len(data.columns), column = 'Name_' + str(i), value = tfidf_array [:,i])
    tfidf_vec = TfidfVectorizer(max_features=5, analyzer="char")
    svd = TruncatedSVD(n_components=3)
    tfidf_array = svd.fit_transform(tfidf_vec.fit_transform(data["Ticket"]))
    for i in range(tfidf_array.shape[1]):
        data.insert(len(data.columns), column = 'Ticket_' + str(i), value = tfidf_array [:,i])
    data['Ticket'] = data['Ticket'].str.extract('(\d+)', expand=False).fillna(0).astype(float)
    data['Ticket'] = np.round(np.log1p(data['Ticket'])*10)
    data.drop(['Name'],1,inplace=True)
    return data.astype(float)

## Import data

In [7]:
train = pd.read_csv("../input/titanic/train.csv", dtype={"Age": np.float64}, index_col='PassengerId' )
test = pd.read_csv("../input/titanic/test.csv", dtype={"Age": np.float64}, index_col='PassengerId')

## Creating all variables needed for training
Train, target 

In [8]:
df = pd.concat((train,test),0)
target = train['Survived'].astype(float)
df = prep_data(df)

df['Ticket'] = df['Ticket'].astype(int).astype('category')

col_to_use = ['Embarked', 'Fare', 'Parch', 'Pclass', 'Sex', 'Age', 
              'NameLen', 'Has_Cabin', 'Cabin', 'FamilySize', 'isFamily', 
              'Title', 'TicketSurv', 'NTicket', 'Boy', 'manual_tree',
              'Ticket_0', 'Ticket_1', 'Ticket_2',
              'Name_0', 'Name_1', 'Name_2', 'Name_3', 'Name_4',
              'Name_5', 'Name_6', 'Name_7', 'Name_8', 'Name_9']

df = pd.get_dummies(df[col_to_use])
df[col_to_use] += 0.0

mungedtrain = df[:train.shape[0]].copy()
mungedtest = df[train.shape[0]:].copy()
mytrain = mungedtrain.values.tolist()
mytest = mungedtest.values.tolist()

## Training genetic program

In [9]:
GPhof = []

for n in [5,7,14,21,28,42,57]:
    hof, Tbox = gen_alg(mungedtrain, target, seed=n, mxvl=7, ngen=121)
    GPhof.append(hof)

   	      	               fitness                	             size             
   	      	--------------------------------------	------------------------------
gen	nevals	avg     	gen	max    	nevals	avg  	gen	max	nevals
0  	200   	0.579299	0  	0.79349	200   	3.985	0  	12 	200   
1  	159   	0.609057	1  	0.818182	159   	4.89 	1  	20 	159   
2  	141   	0.628468	2  	0.867565	141   	5.28 	2  	20 	141   
3  	159   	0.655623	3  	0.868687	159   	6.1  	3  	23 	159   
4  	153   	0.685853	4  	0.868687	153   	6.755	4  	22 	153   
5  	155   	0.709646	5  	0.868687	155   	6.49 	5  	18 	155   
6  	169   	0.70165 	6  	0.89899 	169   	6.515	6  	24 	169   
7  	148   	0.729607	7  	0.900112	148   	6.405	7  	20 	148   
8  	148   	0.740404	8  	0.900112	148   	6.12 	8  	19 	148   
9  	161   	0.719007	9  	0.900112	161   	6.32 	9  	18 	161   
10 	155   	0.741577	10 	0.900112	155   	6.42 	10 	18 	155   
11 	151   	0.751925	11 	0.901235	151   	6.45 	11 	18 	151   
12 	150   	0.753103	12 	0.901235	150   	7.025	1

2  	161   	0.644394	2  	0.872054	161   	5.43 	2  	24 	161   
3  	146   	0.664686	3  	0.895623	146   	6.085	3  	27 	146   
4  	153   	0.704371	4  	0.900112	153   	6.55 	4  	21 	153   
5  	162   	0.740988	5  	0.901235	162   	8.27 	5  	24 	162   
6  	157   	0.762688	6  	0.902357	157   	9.13 	6  	23 	157   
7  	167   	0.777469	7  	0.902357	167   	10.845	7  	25 	167   
8  	154   	0.814439	8  	0.902357	154   	12.145	8  	28 	154   
9  	151   	0.834214	9  	0.902357	151   	12.495	9  	30 	151   
10 	161   	0.844551	10 	0.902357	161   	14.035	10 	34 	161   
11 	158   	0.849276	11 	0.902357	158   	14.555	11 	29 	158   
12 	146   	0.845988	12 	0.902357	146   	14.985	12 	30 	146   
13 	163   	0.833019	13 	0.902357	163   	16.29 	13 	33 	163   
14 	151   	0.845791	14 	0.903479	151   	17.21 	14 	31 	151   
15 	162   	0.846768	15 	0.903479	162   	17.83 	15 	31 	162   
16 	146   	0.854214	16 	0.904602	146   	18.475	16 	34 	146   
17 	152   	0.847267	17 	0.904602	152   	17.925	17 	34 	152   
18 	162   	0.

3  	146   	0.643434	3  	0.868687	146   	8.7 	3  	30 	146   
4  	153   	0.67115 	4  	0.900112	153   	10.035	4  	31 	153   
5  	153   	0.706117	5  	0.900112	153   	9.47  	5  	32 	153   
6  	154   	0.737009	6  	0.900112	154   	7.625 	6  	32 	154   
7  	167   	0.75427 	7  	0.900112	167   	8.145 	7  	25 	167   
8  	162   	0.772935	8  	0.900112	162   	8.95  	8  	25 	162   
9  	152   	0.814489	9  	0.900112	152   	10.485	9  	23 	152   
10 	151   	0.815051	10 	0.900112	151   	12.145	10 	26 	151   
11 	152   	0.840864	11 	0.900112	152   	13.2  	11 	27 	152   
12 	149   	0.836476	12 	0.900112	149   	13.485	12 	35 	149   
13 	150   	0.83812 	13 	0.900112	150   	14.175	13 	35 	150   
14 	162   	0.828479	14 	0.900112	162   	14.735	14 	37 	162   
15 	148   	0.830527	15 	0.900112	148   	13.975	15 	37 	148   
16 	158   	0.82876 	16 	0.900112	158   	14.935	16 	39 	158   
17 	146   	0.847009	17 	0.900112	146   	15.27 	17 	35 	146   
18 	150   	0.835629	18 	0.900112	150   	15.92 	18 	40 	150   
19 	157   

6  	157   	0.750752	6  	0.900112	157   	6.37 	6  	20 	157   
7  	135   	0.78009 	7  	0.900112	135   	6.335	7  	20 	135   
8  	150   	0.768204	8  	0.900112	150   	6.265	8  	21 	150   
9  	147   	0.792565	9  	0.901235	147   	7.2  	9  	18 	147   
10 	151   	0.795208	10 	0.900112	151   	6.87 	10 	21 	151   
11 	162   	0.796341	11 	0.900112	162   	7.605	11 	19 	162   
12 	148   	0.7989  	12 	0.901235	148   	7.01 	12 	15 	148   
13 	168   	0.781027	13 	0.901235	168   	6.84 	13 	18 	168   
14 	153   	0.798277	14 	0.901235	153   	7.21 	14 	16 	153   
15 	152   	0.786706	15 	0.902357	152   	7.49 	15 	21 	152   
16 	153   	0.804764	16 	0.902357	153   	8.11 	16 	19 	153   
17 	155   	0.808013	17 	0.902357	155   	8.64 	17 	21 	155   
18 	166   	0.822767	18 	0.902357	166   	9.785	18 	25 	166   
19 	154   	0.832974	19 	0.902357	154   	10.09	19 	30 	154   
20 	141   	0.8428  	20 	0.902357	141   	10.96	20 	28 	141   
21 	154   	0.842514	21 	0.902357	154   	12.15	21 	27 	154   
22 	149   	0.847183	22 	

10 	156   	0.815836	10 	0.901235	156   	12.42 	10 	29 	156   
11 	137   	0.84064 	11 	0.901235	137   	13.145	11 	31 	137   
12 	153   	0.820022	12 	0.902357	153   	12.295	12 	27 	153   
13 	153   	0.826588	13 	0.902357	153   	13.31 	13 	30 	153   
14 	139   	0.82633 	14 	0.902357	139   	13.395	14 	35 	139   
15 	164   	0.833513	15 	0.902357	164   	15.045	15 	31 	164   
16 	154   	0.83858 	16 	0.902357	154   	15.97 	16 	33 	154   
17 	162   	0.835281	17 	0.903479	162   	17.03 	17 	40 	162   
18 	155   	0.837789	18 	0.903479	155   	18.335	18 	34 	155   
19 	152   	0.837649	19 	0.903479	152   	20.55 	19 	41 	152   
20 	165   	0.842043	20 	0.903479	165   	21.105	20 	36 	165   
21 	147   	0.837452	21 	0.903479	147   	22.44 	21 	37 	147   
22 	149   	0.853799	22 	0.903479	149   	22.3  	22 	37 	149   
23 	151   	0.848356	23 	0.903479	151   	22.64 	23 	49 	151   
24 	155   	0.852379	24 	0.903479	155   	23.635	24 	43 	155   
25 	168   	0.859383	25 	0.903479	168   	23.405	25 	34 	168   
26 	146 

14 	152   	0.752688	14 	0.875421	152   	5.575	14 	18 	152   
15 	153   	0.75693 	15 	0.900112	153   	5.865	15 	27 	153   
16 	153   	0.770185	16 	0.900112	153   	7.125	16 	19 	153   
17 	157   	0.782772	17 	0.900112	157   	7.335	17 	21 	157   
18 	161   	0.795926	18 	0.900112	161   	8.93 	18 	21 	161   
19 	162   	0.807896	19 	0.900112	162   	9.53 	19 	24 	162   
20 	148   	0.830527	20 	0.900112	148   	10.68	20 	20 	148   
21 	139   	0.838328	21 	0.900112	139   	10.475	21 	22 	139   
22 	152   	0.843513	22 	0.900112	152   	11.685	22 	25 	152   
23 	158   	0.821571	23 	0.900112	158   	11.21 	23 	22 	158   
24 	153   	0.83578 	24 	0.900112	153   	11.29 	24 	25 	153   
25 	166   	0.81963 	25 	0.900112	166   	11.295	25 	23 	166   
26 	156   	0.836689	26 	0.901235	156   	11.415	26 	23 	156   
27 	144   	0.834299	27 	0.901235	144   	12.07 	27 	26 	144   
28 	149   	0.848822	28 	0.901235	149   	12.14 	28 	24 	149   
29 	149   	0.831251	29 	0.901235	149   	11.565	29 	25 	149   
30 	166   	0.83

18 	151   	0.811914	18 	0.901235	151   	12.255	18 	28 	151   
19 	163   	0.806066	19 	0.901235	163   	12.455	19 	27 	163   
20 	149   	0.8289  	20 	0.901235	149   	13.25 	20 	30 	149   
21 	145   	0.830196	21 	0.901235	145   	12.25 	21 	31 	145   
22 	154   	0.82133 	22 	0.901235	154   	12.05 	22 	24 	154   
23 	150   	0.830847	23 	0.901235	150   	12.25 	23 	24 	150   
24 	149   	0.817464	24 	0.901235	149   	12.465	24 	25 	149   
25 	152   	0.814501	25 	0.901235	152   	12.93 	25 	26 	152   
26 	171   	0.8094  	26 	0.901235	171   	12.07 	26 	24 	171   
27 	154   	0.828608	27 	0.901235	154   	12.35 	27 	24 	154   
28 	159   	0.839063	28 	0.901235	159   	12.38 	28 	24 	159   
29 	149   	0.823889	29 	0.902357	149   	12.565	29 	33 	149   
30 	157   	0.831874	30 	0.902357	157   	12.615	30 	32 	157   
31 	155   	0.827222	31 	0.902357	155   	12.725	31 	27 	155   
32 	154   	0.823782	32 	0.902357	154   	13.145	32 	26 	154   
33 	146   	0.830842	33 	0.902357	146   	13.475	33 	29 	146   
34 	147 

23 	153   	0.840572	23 	0.902357	153   	18.69 	23 	33 	153   
24 	154   	0.836577	24 	0.902357	154   	18.75 	24 	35 	154   
25 	153   	0.862217	25 	0.902357	153   	19.105	25 	37 	153   
26 	145   	0.858109	26 	0.902357	145   	18.985	26 	42 	145   
27 	173   	0.84055 	27 	0.902357	173   	18.24 	27 	34 	173   
28 	161   	0.839198	28 	0.902357	161   	17.74 	28 	35 	161   
29 	158   	0.858373	29 	0.902357	158   	17.96 	29 	34 	158   
30 	160   	0.848485	30 	0.902357	160   	18.8  	30 	44 	160   
31 	161   	0.854703	31 	0.902357	161   	19.98 	31 	44 	161   
32 	163   	0.841336	32 	0.902357	163   	20.16 	32 	44 	163   
33 	162   	0.845971	33 	0.902357	162   	20.495	33 	45 	162   
34 	150   	0.846689	34 	0.902357	150   	22.105	34 	49 	150   
35 	160   	0.839411	35 	0.902357	160   	22.14 	35 	44 	160   
36 	149   	0.860836	36 	0.902357	149   	23.895	36 	45 	149   
37 	161   	0.836178	37 	0.902357	161   	24.785	37 	49 	161   
38 	143   	0.864512	38 	0.902357	143   	25.115	38 	44 	143   
39 	156 

In [10]:
test = test.reset_index()

## True labels for test data 
Thanks Tarun Paparaju - https://www.kaggle.com/tarunpaparaju for this method

You can find original notebook here - https://www.kaggle.com/tarunpaparaju/titanic-competition-how-top-lb-got-their-score

In [11]:
test_data_with_labels = pd.read_csv('../input/titanic-test-data/titanic.csv')
test_data = pd.read_csv('../input/titanic/test.csv')
for i, name in enumerate(test_data_with_labels['name']):
    if '"' in name:
        test_data_with_labels['name'][i] = re.sub('"', '', name)
        
for i, name in enumerate(test_data['Name']):
    if '"' in name:
        test_data['Name'][i] = re.sub('"', '', name)
survived = []

for name in test_data['Name']:
    survived.append(int(test_data_with_labels.loc[test_data_with_labels['name'] == name]['survived'].values[-1]))

## Predictions

In [12]:
testPredictions = np.zeros((len(GPhof),test.shape[0]))
for n in range(len(GPhof)):
    GPfunc = Tbox.compile(expr=GPhof[n])
    testPredictions[n] += Outputs(np.array([GPfunc(*x) for x in mytest]))
    print("Score {}:".format(n),accuracy_score(survived,testPredictions[n]))
    print(GPhof[n])

Score 0: 0.7464114832535885
pDiv(sub(tanh(min(sin(pPow(Cabin, TicketSurv)), tanh(pPow(Name_0, Boy)))), TicketSurv), pSqrt(Title))
Score 1: 0.7703349282296651
neg(max(mul(mul(pDiv(cos(NameLen), add(NTicket, Name_6)), pDiv(min(min(TicketSurv, Title), sub(NTicket, Name_1)), add(min(TicketSurv, FamilySize), Name_6))), Cabin), max(mul(mul(pDiv(cos(NameLen), add(TicketSurv, Name_6)), pDiv(min(NTicket, Embarked), add(NTicket, NameLen))), Cabin), mul(TicketSurv, max(Title, mul(manual_tree, manual_tree))))))
Score 2: 0.7416267942583732
pDiv(pSqrt(pPow(pPow(Title, Name_0), add(Name_0, add(Cabin, Name_9)))), neg(min(pSqrt(pPow(add(Sex, neg(Pclass)), sub(neg(TicketSurv), neg(Has_Cabin)))), min(pPow(Pclass, pPow(Name_7, neg(Name_0))), pSqrt(TicketSurv)))))
Score 3: 0.7344497607655502
pDiv(TicketSurv, neg(max(min(pSqrt(max(Title, Name_5)), Embarked), max(Title, pDiv(max(Title, pDiv(Cabin, Ticket_2)), min(cos(FamilySize), pDiv(Parch, Boy)))))))
Score 4: 0.7440191387559809
pDiv(add(add(add(add(Name_7,

## Cheking score

In [13]:
testPrediction = np.round(np.mean(testPredictions,axis=0)).astype(int)
print("Score :",accuracy_score(survived,testPrediction))
test['Survived'] = testPrediction
test[['PassengerId','Survived']].to_csv('gp_submit.csv', index=False)

Score : 0.7464114832535885


Thanks to Paulo Pinto - https://www.kaggle.com/paulorzp - https://www.kaggle.com/paulorzp/titanic-gp-model-training

Thanks to Ashwini Swain - https://www.kaggle.com/ash316 - https://www.kaggle.com/ash316/eda-to-prediction-dietanic