# Titanic - Genetic Algorithm

In [1]:
import random

import numpy as np
import pandas as pd

from nuances import PassengerNuance

## Attributes and Nuances

- **Attribute X1: PassengerId** → 0..200 **(X11)**, 200..400 **(X12)**, 400..600 **(X13)**, 600..800 **(X14)**, >800 **(X15)**;
- **Attribute X2: Pclass (Ticket class)** → 1 **(X21)**, 2 **(X22)** or 3 **(X23)**;
- **Attribute X3: Sex** → male **(X31)**, female **(X32)**;
- **Attribute X4: Age** → 0..10 **(X41)**, 10..20 **(X42)**, 20..30 **(X43)**, 30..40 **(X44)**, 40..50 **(X45)**, 50..60 **(X46)**, 60..70 **(X47)**, >70 **(X48)**;
- **Attribute X5: SibSp (# of siblings / spouses aboard the Titanic)** → 0..2 **(X51)**, 2..4 **(X52)**, 4..6 **(X53)**, >6 **(X54)**;
- **Attribute X6: Parch (# of parents / children aboard the Titanic)** → 0..2 **(X61)**, 2..4 **(X62)**, 4..6 **(X63)**;
- **Attribute X7: Fare** → 0..100 **(X71)**, 100..200 **(X72)**, 200..300 **(X73)**, 300..400 **(X74)**, >400 **(X75)**;
- **Attribute X8: Embarked (Port of Embarkation)** → C **(X81)**, Q **(X82)**, S **(X83)**, OTHER **(X83)**;
- **Attribute X9: Cabin** → A **(X91)**, B **(X92)**, C **(X93)**, D **(X94)**, E **(X94)**, F **(X95)**, OTHER **(X96)**;

In [2]:
train_data = pd.read_csv("train.csv")
train_data.set_index(keys=["PassengerId"], inplace=True)

ATTRIBUTES_NUMBER = 37
# GENES_SAMPLE = len(train_data)
GENES_SAMPLE = 50

In [3]:
train_data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Categorizing Data

In [4]:
def gene_data(df):
    df = df.copy()
    genes_data_raw = []
    for index, row in df.iterrows():
        data = {
            "PassengerId": index,
            "Survived": row["Survived"]
        }
        # data.update(PassengerNuance.pid(index))
        data.update(PassengerNuance.pclass(row["Pclass"]))
        data.update(PassengerNuance.sex(row["Sex"]))
        data.update(PassengerNuance.age(row["Age"]))
        data.update(PassengerNuance.sibsp(row["SibSp"]))
        data.update(PassengerNuance.parch(row["Parch"]))
        data.update(PassengerNuance.fare(row["Fare"]))
        data.update(PassengerNuance.embarked(row["Embarked"]))
        data.update(PassengerNuance.cabin(row["Cabin"]))
        genes_data_raw.append(data)

    genes_df = pd.DataFrame(genes_data_raw)
    genes_df.set_index(keys=["PassengerId"], inplace=True)
    return genes_df

genes = gene_data(train_data)
genes.head()

Unnamed: 0_level_0,Survived,X21,X22,X23,X24,X31,X32,X41,X42,X43,...,X82,X83,X84,X91,X92,X93,X94,X95,X96,X97
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1,1.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1,0.0,0.0,1.0,0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0,0.0,0.0,1.0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### Formula for Classification

$ b_0 + \displaystyle\sum_{a=1}^{n} [b_n * x_n] $

## Defining the Random Variables

In [5]:
def holland_classifier_constants():
    s = random.uniform(-1, 2)
    v = np.array([-1 + 2 * random.uniform(0, 1) for _ in range (0, ATTRIBUTES_NUMBER)])
    return s, v

## First Run

In [6]:
s, v = holland_classifier_constants()

def generate_chromosome(df, scalar, vector):
    df = df.copy()
    gene_rows = df.filter(like="X")
    for row, gene in zip(gene_rows, vector):
        df[row] = df[row] * gene

    df["Result"] = scalar + df.filter(like="X").sum(axis=1)
    return df
 
calculated_chromosomes = generate_chromosome(df=genes.sample(GENES_SAMPLE), scalar=s, vector=v)
calculated_chromosomes.head()

Unnamed: 0_level_0,Survived,X21,X22,X23,X24,X31,X32,X41,X42,X43,...,X83,X84,X91,X92,X93,X94,X95,X96,X97,Result
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159,0,-0.0,-0.0,-0.24073,0.0,-0.188086,0.0,0.0,0.0,0.0,...,0.017092,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,0.965507
675,0,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.0,...,0.017092,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,-0.324089
176,0,-0.0,-0.0,-0.24073,0.0,-0.188086,0.0,0.0,0.992091,0.0,...,0.017092,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,1.060044
552,0,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.535216,...,0.017092,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,2.362952
289,1,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.0,...,0.017092,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,1.484073


In [7]:
def get_separator():
    return 0

separator = get_separator()
separator

0

## Guessing

In [8]:
def guess(df, separator):
    df = df.copy()
    for index, row in df.iterrows():
        if row["Result"] >= separator:
            df.at[index,'SurvivedGuess'] = 1
        else:
            df.at[index,'SurvivedGuess'] = 0
    df["SurvivedGuess"] = df["SurvivedGuess"].astype(int)
    return df

guessed_attempt = guess(df=calculated_chromosomes, separator=separator)
guessed_attempt.head()

Unnamed: 0_level_0,Survived,X21,X22,X23,X24,X31,X32,X41,X42,X43,...,X84,X91,X92,X93,X94,X95,X96,X97,Result,SurvivedGuess
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159,0,-0.0,-0.0,-0.24073,0.0,-0.188086,0.0,0.0,0.0,0.0,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,0.965507,1
675,0,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.0,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,-0.324089,0
176,0,-0.0,-0.0,-0.24073,0.0,-0.188086,0.0,0.0,0.992091,0.0,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,1.060044,1
552,0,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.535216,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,2.362952,1
289,1,-0.0,-0.342847,-0.0,0.0,-0.188086,0.0,0.0,0.0,0.0,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,1.484073,1


In [9]:
def guessed_data(df):
    df = df.copy()
    survived_data = df.query("Survived == 1")
    not_survived_data = df.query("Survived == 0")
    
    right_guess = df.query("Survived == SurvivedGuess")
    
    survived_data_guess = right_guess.query("SurvivedGuess == 1")
    not_survived_data_guess = right_guess.query("SurvivedGuess == 0")

    return {
        "all_data": df,
        "survived_data": survived_data,
        "not_survived_data": not_survived_data,
        "right_guess": right_guess,
        "survived_data_guess": survived_data_guess,
        "not_survived_data_guess": not_survived_data_guess,
    }


guessed = guessed_data(guessed_attempt)

In [10]:
def calculate_fitness(df):
    df = df.copy()
    # return len(df["right_guess"]) / len(df["all_data"])
    return (len(df["survived_data_guess"]) * len(df["not_survived_data_guess"])) / (len(df["survived_data"]) * len(df["not_survived_data"]))

fitness = calculate_fitness(df=guessed)
print(f"Guessed {fitness * 100}% of the sample")

Guessed 6.451612903225806% of the sample


In [11]:
guessed["right_guess"].sample(2)

Unnamed: 0_level_0,Survived,X21,X22,X23,X24,X31,X32,X41,X42,X43,...,X84,X91,X92,X93,X94,X95,X96,X97,Result,SurvivedGuess
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
316,1,-0.0,-0.0,-0.24073,0.0,-0.0,0.031968,0.0,0.0,0.535216,...,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.306908,2.685123,1
306,1,-0.252485,-0.0,-0.0,0.0,-0.188086,0.0,0.462894,0.0,0.0,...,-0.0,0.0,0.0,0.339589,0.0,0.0,-0.0,0.0,2.22828,1


## Generate the chromosomes' database

In [12]:
chromosomes = []
for i in range(6):
    s, v = holland_classifier_constants()
    
    # Calculate the genes
    g = generate_chromosome(df=genes.sample(GENES_SAMPLE), scalar=s, vector=v)

    # separator = get_separator(df=g)

    # Guess attempt
    ga = guess(df=g, separator=separator)

    # Get the guess overview
    gd = guessed_data(ga)

    # Calculate fitness
    f = calculate_fitness(gd)

    data = {
        "fitness": f,
        "scalar": s,
        "vector": v
    }

    chromosomes.append(data)

In [13]:
dad, mom = random.sample(chromosomes, 2)
print(f"Dad: {dad['fitness']}, Mom: {mom['fitness']}")

Dad: 0.0, Mom: 0.15625


## Generate 3 children

In [14]:
# Next steps:
# 1. Choose 1 random column per child;
# 2. The first column will have dad genes and the last, mom genes
# 3. Generate 3 children
# 4. Randomly select 1 gene for each child from the "chromosomes" list
# 5. Select the 2 children with the highest score and replace with those from the "chromosomes" list with the lowest score
# 6. Chose a new dad and mom
# 7. Run all the processes again

In [15]:
def generate_child(df, mom, dad, chromosomes):
    df = df.copy()

    dad_scalar = dad["scalar"]
    mom_scalar = mom["scalar"]

    dad_vector = dad["vector"]
    mom_vector = mom["vector"]
    
    chromosomes_without_parents = chromosomes.copy()
    chromosomes_without_parents.remove(dad)
    chromosomes_without_parents.remove(mom)
    
    random_chromosome = random.choice(chromosomes)

    start_from_gene = random.randint(0, ATTRIBUTES_NUMBER) - 1
    random_gene = random.randint(0, ATTRIBUTES_NUMBER)
    
    child_vector = np.concatenate((dad_vector[start_from_gene:], mom_vector[:start_from_gene]), axis=None)

    if random_gene == ATTRIBUTES_NUMBER:
        scalar = random_chromosome["scalar"]
    else:
        scalar = random.choice([dad_scalar, mom_scalar])
        child_vector[random_gene - 1] = random_chromosome["vector"][random_gene - 1]
    
    return generate_chromosome(df=df.sample(GENES_SAMPLE), scalar=scalar, vector=child_vector)

children = [generate_child(df=genes, mom=mom, dad=dad, chromosomes=chromosomes) for _ in range(3)]

def classify_children(children_data):
    children_data = children_data.copy()
    classified_children = []
    for child in children:
        # separator = get_separator(df=child)
        
        ga = guess(df=child, separator=separator)
    
        # Get the guess overview
        gd = guessed_data(ga)
    
        # Calculate fitness
        f = calculate_fitness(gd)
    
        data = {
            "fitness": f,
            "scalar": s,
            "vector": v
        }
    
        classified_children.append(data)
    return classified_children

classified_children = classify_children(children_data=children)

In [16]:
def get_highest_score_data(children_data):
    return sorted(children_data, key=lambda x: x["fitness"], reverse=True)[:2]

def get_lowest_score_data(chromosomes_data):
    return sorted(chromosomes_data, key=lambda x: x["fitness"])[:2]

def replace_with_children(chromosomes_data, children_data):
    chromosomes_data = chromosomes_data.copy()
    parent_1, parent_2 = get_lowest_score_data(chromosomes_data=chromosomes_data)
    child_1, child_2 = get_highest_score_data(children_data=children_data)

    if child_1["fitness"] >= parent_1["fitness"]:
        print("Changed Parent 1")
        chromosomes_data[chromosomes_data.index(parent_1)] = child_1
    if child_2["fitness"] >= parent_2["fitness"]:
        print("Changed Parent 2")
        chromosomes_data[chromosomes_data.index(parent_2)] = child_2

    return sorted(chromosomes_data, key=lambda x: x["fitness"], reverse=True)

# Update chromosomes database
chromosomes = replace_with_children(chromosomes_data=chromosomes, children_data=classified_children)

Changed Parent 1
Changed Parent 2


In [17]:
fitness = 0
run = 0
try:
    while True:
        dad, mom = random.sample(chromosomes, 2)
        children = [generate_child(df=genes, mom=mom, dad=dad, chromosomes=chromosomes) for _ in range(3)]
        classified_children = classify_children(children_data=children)
    
        chromosomes = replace_with_children(chromosomes_data=chromosomes, children_data=classified_children)
        fitness = chromosomes[0]["fitness"] * 100
        run += 1
        print(f"Current fitness: {fitness}. Run number: {run}.")
except KeyboardInterrupt:
    print(f"Fitness: {chromosomes[0]['fitness']}. Scalar {chromosomes[0]['scalar']}. Vector {chromosomes[0]['vector']}")

Changed Parent 1
Current fitness: 48.538961038961034. Run number: 1.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 2.
Current fitness: 48.538961038961034. Run number: 3.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 4.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 5.
Current fitness: 48.538961038961034. Run number: 6.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 7.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 8.
Current fitness: 48.538961038961034. Run number: 9.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 10.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 11.
Current fitness: 48.538961038961034. Run number: 12.
Current fitness: 48.538961038961034. Run number: 13.
Current fitness: 48.538961038961034. Run number: 14.
Changed Parent 1
Current fitness: 48.538961038961034. Run number: 15.
Current fitness: 48.538961038961034. Run number: 16.
Current 

In [18]:
cc = generate_chromosome(df=genes, scalar=chromosomes[0]['scalar'], vector=chromosomes[0]['vector'])
ga = guess(df=cc, separator=separator)
gd = guessed_data(df=ga)
len(gd["right_guess"])

336

In [19]:
chromosomes[0]

{'fitness': 0.7606112054329371,
 'scalar': 1.4849522937323143,
 'vector': array([-0.13388169, -0.37025793,  0.24121009,  0.85047061,  0.9198024 ,
        -0.05080894, -0.91224979, -0.83843908, -0.83476502, -0.43223271,
         0.98736047,  0.83807356, -0.199424  ,  0.26604513,  0.78692495,
        -0.86861359,  0.11313268,  0.39160941, -0.85000981,  0.77404874,
        -0.40798302, -0.57130446, -0.85437354, -0.63028635, -0.70729245,
        -0.41404743,  0.9470761 , -0.83380225,  0.628591  ,  0.26580437,
        -0.05743832,  0.39703306, -0.52745294,  0.92438152,  0.13463441,
        -0.38648807,  0.57390987])}