In [None]:
import random
import pandas as pd
import numpy as np

# Size of the population handed to generateChromossomes.
CHROMOSOME_NUMBER = 128
# Children bred per iteration; the same number of lowest-fitness chromosomes
# is overwritten by them at the end of each iteration.
CHILDREN_NUMBER = 32
# Number of passes of the main evolution loop.
ITERATIONS = 250
# Number of coefficients re-randomised in every newly created child.
MUTATION_NUMBER = 1

In [None]:
def chromosomeFitness(chromosome, data):
    """Score how well a chromosome separates survivors from non-survivors.

    Every ``binary_values`` string is read as a 0/1 feature vector. A passenger
    is predicted to survive when the sum of the coefficients selected by the
    set bits is strictly positive.

    NOTE(review): a later cell defines ``chromosomeFitness`` again; on a
    top-to-bottom run that later definition is the one in effect. Keep one.

    Args:
        chromosome: Sequence of numeric coefficients, one per feature bit. It
            is truncated or zero-padded to the bit-string length if it differs.
        data: DataFrame with a ``binary_values`` column (equal-length strings
            of '0'/'1') and a ``Survived`` column (0/1). It is not modified.

    Returns:
        ``(correct survivors / survivors) * (correct deaths / deaths)`` as a
        float. Returns 0.0 when ``data`` is empty or contains only one class,
        instead of dividing by zero.
    """
    total = len(data)
    if total == 0:
        return 0.0

    survived = data['Survived'].to_numpy()
    survived_count = survived.sum()
    died_count = total - survived_count
    if survived_count == 0 or died_count == 0:
        # The product of per-class accuracies is undefined without both classes.
        return 0.0

    bit_strings = data['binary_values'].to_numpy()
    bit_count = len(bit_strings[0])

    # Make the coefficient vector exactly as long as the bit strings.
    coefficients = np.asarray(chromosome, dtype=float)
    if len(coefficients) > bit_count:
        coefficients = coefficients[:bit_count]
    elif len(coefficients) < bit_count:
        coefficients = np.pad(coefficients, (0, bit_count - len(coefficients)))

    bits = np.array([np.array(list(bit_string), dtype=int)
                     for bit_string in bit_strings])
    predicted_survived = (bits * coefficients).sum(axis=1) > 0

    correct_survived = np.count_nonzero(predicted_survived & (survived == 1))
    correct_died = np.count_nonzero(~predicted_survived & (survived == 0))

    return (correct_survived * correct_died) / (survived_count * died_count)

In [None]:
def formatData(columns):
    """One-hot encode the training passengers and pack each row into a bit string.

    Sex, Age (bins of width 10), Pclass, SibSp, Parch, Embarked and Fare (bins
    of width 5) are each expanded into indicator columns, in that order. The
    indicators of a row are then joined into one string of '0'/'1' characters
    stored in ``binary_values``, and the original ``Survived`` column is
    re-attached. The resulting frame is printed and returned.

    NOTE(review): the bin ranges end at the maximum Age/Fare found in the
    frame passed in, so the encoding depends on the data it is given.
    NOTE(review): pandas documents that ``labels`` is ignored by ``pd.cut``
    when ``bins`` is an IntervalIndex - confirm ``labels=False`` and
    ``include_lowest=True`` have the effect intended here.
    """
    survival_flags = columns['Survived']

    def _join_bits(row):
        # Collapse one row of '0'/'1' strings into a single string.
        return ''.join(row)

    def _binned_dummies(frame, name, step):
        # Cut `name` into `step`-wide intervals from 0 up to its maximum,
        # then one-hot encode the resulting bins.
        upper_bound = frame[name].max()
        intervals = pd.interval_range(start=0, end=upper_bound, freq=step)
        frame[name] = pd.cut(
            frame[name], bins=intervals, labels=False, include_lowest=True)
        return pd.get_dummies(frame[name], prefix=name, dtype=float)

    # Sex is encoded in place on the frame; the other columns are encoded one by one.
    encoded = pd.get_dummies(columns, columns=['Sex'], dtype=float)

    feature_blocks = [encoded.filter(like='Sex'),
                      _binned_dummies(encoded, 'Age', 10)]
    for name in ('Pclass', 'SibSp', 'Parch', 'Embarked'):
        feature_blocks.append(
            pd.get_dummies(encoded[name], prefix=name, dtype=float))
    feature_blocks.append(_binned_dummies(encoded, 'Fare', 5))

    features = pd.concat(feature_blocks, axis=1)

    # Every indicator becomes the character '1' or '0'.
    bit_characters = features.astype(bool).astype(int).astype(str)
    features['binary_values'] = bit_characters.apply(_join_bits, axis=1)
    features['Survived'] = survival_flags

    print(features)

    return features

In [None]:
def formatTestData(columns):
    """One-hot encode the test passengers and pack each row into a bit string.

    Sex, Age (bins of width 10 between 0 and 100), Pclass, SibSp, Parch,
    Embarked and Fare (bins of width 5 up to the maximum fare) are expanded
    into indicator columns, in that order. The indicators of a row are joined
    into a string of '0'/'1' characters stored in ``binary_values``, and
    ``PassengerId`` is re-attached as the last column. The resulting frame is
    printed and returned.

    The frame passed in is left untouched: ``PassengerId`` is removed from a
    new frame rather than dropped in place on the caller's object.

    NOTE(review): formatData bins Age up to the training maximum while this
    function bins up to 100, and the indicator columns are derived from
    whatever data each function receives, so the bit layout produced here is
    not guaranteed to line up with the one a chromosome was trained on -
    confirm or align the two encodings.

    Args:
        columns: DataFrame with PassengerId, Pclass, Sex, Age, SibSp, Parch,
            Embarked and Fare columns.

    Returns:
        DataFrame of indicator columns followed by ``binary_values`` and
        ``PassengerId``.
    """
    passenger_ids = columns['PassengerId']
    # Non-mutating drop: the caller's frame keeps its PassengerId column.
    encoded = columns.drop(columns='PassengerId')

    # Sex is encoded in place on the frame; the other columns are encoded one by one.
    encoded = pd.get_dummies(encoded, columns=['Sex'], dtype=float)
    feature_blocks = [encoded.filter(like='Sex')]

    # Age in 10-year intervals over a fixed 0-100 range.
    age_intervals = pd.interval_range(start=0, end=100, freq=10)
    encoded['Age'] = pd.cut(
        encoded['Age'], bins=age_intervals, labels=False, include_lowest=True)
    feature_blocks.append(
        pd.get_dummies(encoded['Age'], prefix='Age', dtype=float))

    for name in ('Pclass', 'SibSp', 'Parch', 'Embarked'):
        feature_blocks.append(
            pd.get_dummies(encoded[name], prefix=name, dtype=float))

    # Fare in intervals of 5 up to the largest fare present in this frame.
    max_fare = encoded['Fare'].max()
    fare_intervals = pd.interval_range(start=0, end=max_fare, freq=5)
    encoded['Fare'] = pd.cut(
        encoded['Fare'], bins=fare_intervals, labels=False, include_lowest=True)
    feature_blocks.append(
        pd.get_dummies(encoded['Fare'], prefix='Fare', dtype=float))

    features = pd.concat(feature_blocks, axis=1)

    # Every indicator becomes the character '1' or '0'; a row is their concatenation.
    bit_characters = features.astype(bool).astype(int).astype(str)
    features['binary_values'] = bit_characters.apply(
        lambda row: ''.join(row), axis=1)

    # 'binary_values' is already the last feature column; PassengerId goes after it.
    features['PassengerId'] = passenger_ids
    print(features)

    return features


In [None]:
def generateChromossomes(chromosome_number, binary_values_length):
    """Build the initial random population.

    Returns a list of ``chromosome_number`` dicts. Each dict has a
    ``coefficients`` list of ``binary_values_length`` floats drawn with
    ``random.uniform(-1, 1)`` and an empty ``binary_values`` string.
    """
    def _random_coefficients():
        # One independent draw per feature bit.
        return [random.uniform(-1, 1) for _ in range(binary_values_length)]

    return [
        {'coefficients': _random_coefficients(), 'binary_values': ''}
        for _ in range(chromosome_number)
    ]

In [None]:
def chromosomeFitness(chromosome, data):
    """Score how well a chromosome separates survivors from non-survivors.

    Every ``binary_values`` string is read as a 0/1 feature vector. A passenger
    is predicted to survive when the sum of the coefficients selected by the
    set bits is strictly positive.

    NOTE(review): an earlier cell also defines ``chromosomeFitness``; on a
    top-to-bottom run this later definition replaces it. Keep only one.

    Args:
        chromosome: Sequence of numeric coefficients, one per feature bit. It
            is truncated or zero-padded to the bit-string length if it differs.
        data: DataFrame with a ``binary_values`` column (equal-length strings
            of '0'/'1') and a ``Survived`` column (0/1). It is not modified.

    Returns:
        ``(correct survivors / survivors) * (correct deaths / deaths)`` as a
        float. Returns 0.0 when ``data`` is empty or contains only one class,
        instead of dividing by zero.
    """
    total = len(data)
    if total == 0:
        return 0.0

    survived = data['Survived'].to_numpy()
    survived_count = survived.sum()
    died_count = total - survived_count
    if survived_count == 0 or died_count == 0:
        # The product of per-class accuracies is undefined without both classes.
        return 0.0

    bit_strings = data['binary_values'].to_numpy()
    bit_count = len(bit_strings[0])

    # Make the coefficient vector exactly as long as the bit strings.
    coefficients = np.asarray(chromosome, dtype=float)
    if len(coefficients) > bit_count:
        coefficients = coefficients[:bit_count]
    elif len(coefficients) < bit_count:
        coefficients = np.pad(coefficients, (0, bit_count - len(coefficients)))

    bits = np.array([np.array(list(bit_string), dtype=int)
                     for bit_string in bit_strings])
    predicted_survived = (bits * coefficients).sum(axis=1) > 0

    correct_survived = np.count_nonzero(predicted_survived & (survived == 1))
    correct_died = np.count_nonzero(~predicted_survived & (survived == 0))

    return (correct_survived * correct_died) / (survived_count * died_count)

In [None]:
def tournamentSelection(chromosomes, fitnesses_percentages, tournament_size, data):
    """Pick one parent by a fitness-weighted tournament.

    ``tournament_size`` contenders are drawn (with replacement) from
    ``chromosomes`` using ``fitnesses_percentages`` as sampling weights. Each
    contender is then re-scored with ``chromosomeFitness`` on ``data`` and the
    first contender holding the highest score is returned.
    """
    contenders = random.choices(
        chromosomes, weights=fitnesses_percentages, k=tournament_size)
    scored = [
        (chromosomeFitness(contender['coefficients'], data), contender)
        for contender in contenders
    ]
    # max() keeps the earliest entry among equal scores.
    _, winner = max(scored, key=lambda pair: pair[0])
    return winner


In [None]:
def formatAndExportData(best_chromosome):
    """Predict survival for the test passengers and write ``results.csv``.

    Reads ``./titanic/test.csv``, encodes it with ``formatTestData`` and scores
    every passenger as the sum of the chromosome coefficients whose matching
    bit in ``binary_values`` is '1'. A strictly positive score is exported as
    ``Survived = 1``, anything else as 0. The output file has the two columns
    ``PassengerId`` and ``Survived``.

    NOTE(review): the training cell reads from ``./data/`` while this function
    reads from ``./titanic/`` - confirm both paths are intended.

    Args:
        best_chromosome: Dict whose ``coefficients`` entry is the sequence of
            numeric weights to apply, one per feature bit.

    Raises:
        ValueError: If an encoded passenger has fewer bits than there are
            coefficients (the encodings do not line up).
    """
    test_data = pd.read_csv("./titanic/test.csv")
    # Hand over an independent copy so the encoder never works on a slice of test_data.
    test = test_data[['PassengerId', 'Pclass',
                      'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Fare']].copy()
    formatted_test_data = formatTestData(test)

    coefficients = best_chromosome['coefficients']
    coefficient_count = len(coefficients)

    def _predict(bits):
        # 1 when the coefficients selected by the set bits sum to more than zero.
        if len(bits) < coefficient_count:
            raise ValueError(
                "Encoded passenger has %d bits but the chromosome has %d coefficients"
                % (len(bits), coefficient_count))
        # Bits beyond the last coefficient are ignored, as zip stops at the shorter input.
        score = sum(coeff for coeff, bit in zip(coefficients, bits) if bit == '1')
        return int(score > 0)

    # Assigning a whole list avoids any assumption about the frame's index labels.
    formatted_test_data['Survived'] = [
        _predict(bits) for bits in formatted_test_data['binary_values']]

    # Export the predictions to "results.csv"
    formatted_test_data[['PassengerId', 'Survived']].to_csv(
        'results.csv', index=False)

In [None]:

# Load the training passengers.
# NOTE(review): this cell reads from ./data/ while formatAndExportData reads
# ./titanic/test.csv - confirm both paths are intended.
raw = pd.read_csv('./data/train.csv')

# Keep only the columns the encoder uses and drop rows with any missing value.
data = raw[['Survived', 'Pclass',
            'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Fare']].dropna()
# formatData builds the 'binary_values' column itself.
data = formatData(data)

# Class balance of the rows that remain after dropping missing values.
total_passengers = data['Survived'].count()
total_survived = data['Survived'].sum()
total_died = total_passengers - total_survived

# Positional access: index label 0 is not guaranteed to survive dropna().
binary_values_length = len(data['binary_values'].iloc[0])

chromosomes = generateChromossomes(
    chromosome_number=CHROMOSOME_NUMBER, binary_values_length=binary_values_length)

# Upper bound on re-draws when both tournaments return equal parents, so a
# converged population cannot stall the loop.
MAX_PARENT_RETRIES = 10

best_chromosome = None
best_fitness = float('-inf')

for iteration in range(ITERATIONS):
    # Evaluates fitness for every chromosome
    fitnesses = [chromosomeFitness(chromosome['coefficients'], data)
                 for chromosome in chromosomes]

    # Selection weights proportional to fitness; uniform when nothing scores above zero.
    fitness_sum = sum(fitnesses)
    if fitness_sum > 0:
        fitnesses_percentages = [
            fitness / fitness_sum for fitness in fitnesses]
    else:
        fitnesses_percentages = [1.0 / len(fitnesses)] * len(fitnesses)

    # Update best_chromosome if a better chromosome is found
    best_index = fitnesses.index(max(fitnesses))
    if best_chromosome is None or fitnesses[best_index] > best_fitness:
        best_chromosome = chromosomes[best_index]
        best_fitness = fitnesses[best_index]

    average_fitness = sum(fitnesses) / len(fitnesses)
    print(iteration, "Best Fitness:",
          fitnesses[best_index], "Average Fitness:", average_fitness)

    # Generates parents using tournament selection
    parent_1 = tournamentSelection(
        chromosomes, fitnesses_percentages, tournament_size=3, data=data)
    parent_2 = tournamentSelection(
        chromosomes, fitnesses_percentages, tournament_size=3, data=data)

    for _ in range(MAX_PARENT_RETRIES):
        if parent_2 != parent_1:
            break
        parent_2 = tournamentSelection(
            chromosomes, fitnesses_percentages, tournament_size=3, data=data)

    # Cut points stay strictly inside the chromosome so both parents
    # contribute whenever the chromosome has at least two coefficients.
    max_cut = max(1, binary_values_length - 1)
    parents_cut = sorted(random.randint(1, max_cut)
                         for _ in range(CHILDREN_NUMBER))

    children = []

    for cut in parents_cut:
        child_coefficients = (parent_1['coefficients'][:cut]
                              + parent_2['coefficients'][cut:])
        # generateChromossomes leaves 'binary_values' empty, so this stays ''.
        child_binary_values = (parent_1['binary_values'][:cut]
                               + parent_2['binary_values'][cut:])

        # Re-randomise MUTATION_NUMBER distinct coefficients; sampling without
        # replacement guarantees no position is mutated twice.
        mutation_count = min(MUTATION_NUMBER, len(child_coefficients))
        for mutation_index in random.sample(
                range(len(child_coefficients)), mutation_count):
            child_coefficients[mutation_index] = random.uniform(-1, 1)

        child = {'coefficients': child_coefficients,
                 'binary_values': child_binary_values,
                 'fitness': chromosomeFitness(child_coefficients, data)}
        children.append(child)

        # Children of the final iteration are never re-evaluated by the loop
        # above, so compare them with the best here.
        if child['fitness'] > best_fitness:
            best_chromosome = child
            best_fitness = child['fitness']

    # Every child enters the population, replacing the lowest-fitness
    # chromosomes; the fitness stored on each child is reused for ordering.
    children.sort(key=lambda child: child['fitness'], reverse=True)
    worst_indices = sorted(range(len(fitnesses)), key=lambda i: fitnesses[i])[
        :CHILDREN_NUMBER]
    for index, child in zip(worst_indices, children):
        chromosomes[index] = child
print("Best Chromosome:", best_chromosome)


formatAndExportData(best_chromosome)