In [14]:
import random
import operator
import itertools

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from deap import gp

In [4]:
# peek at features
# TODO: set passenger id as index and filter out the target feature
df = pd.read_csv("clean_train_data.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Deck
0,1,0,3,0,22.0,1,0,7.25,2,3
1,2,1,1,1,38.0,1,0,71.2833,0,0
2,3,1,3,1,26.0,0,0,7.925,2,3
3,4,1,1,1,35.0,1,0,53.1,2,0
4,5,0,3,0,35.0,0,0,8.05,2,3


In [2]:
# create fitness for MO: FP and FN
creator.create("FitnessMin", base.Fitness, weights=(-1.0,-1.0))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMin)

In [6]:
def divide_by_zero(a, b):
    try: 
        return a / b
    except ZeroDivisionError: 
        return 0

In [7]:
def if_then_else(input, output1, output2):
    if input: 
        return output1
    else: 
        return output2

In [12]:
# TODO: func for float to bool

In [9]:
# Create primitives
pset = gp.PrimitiveSetTyped("main", itertools.repeat(float, 8), bool)

pset.addPrimitive(np.add, [float, float], float)
pset.addPrimitive(np.subtract, [float, float], float)
pset.addPrimitive(np.multiply, [float, float], float)
pset.addPrimitive(divide_by_zero, [float, float], float)

pset.addPrimitive(operator.and_, [bool, bool], bool)
pset.addPrimitive(operator.or_, [bool, bool], bool)
pset.addPrimitive(operator.not_, [bool], bool)
pset.addPrimitive(operator.xor, [bool, bool], bool)

pset.addPrimitive(operator.lt, [float, float], bool)
pset.addPrimitive(operator.eq, [float, float], bool)
pset.addPrimitive(if_then_else, [bool, float, float], float)

In [13]:
toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=2)
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("compile", gp.compile, pset=pset)

In [16]:
def eval_function(individual, points, pset):
    func = gp.compile(expr=individual, pset=pset)
    results = np.array([func(*points[x][:8]) for x in range(len(points))])
    truth = df["Survived"].to_numpy()
    tn, fp, fn, tp = confusion_matrix(truth, results).ravel()
    return fp/(fp+tp),fn/(fn+tn)

In [17]:
toolbox.register("evaluate", eval_function, points=df.loc[:, df.columns != ["Survived", "PassengerId"]], pset=pset)


ValueError: Arrays were different lengths: 10 vs 2