In [1]:
from __future__ import print_function
import os
import neat

import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim


from explaneat.core.backprop import NeatNet
from explaneat.core import backprop
from explaneat.core.backproppop import BackpropPopulation
# from explaneat.visualization import visualize
from explaneat.core.experiment import ExperimentReporter
from explaneat.core.utility import one_hot_encode


from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from copy import deepcopy

import time
from datetime import datetime

In [2]:
def one_hot_encode(vals):
    """One-hot encode a 1-D sequence of integer class labels.

    NOTE(review): this definition shadows the `one_hot_encode` imported from
    `explaneat.core.utility` at the top of the notebook.

    Parameters
    ----------
    vals : sequence of int
        Class indices. The encoded width is ``max(vals) + 1``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(vals), max(vals) + 1)`` with a single
        ``1.`` per row at the column given by the label. An empty input
        yields an empty ``(0, 0)`` array instead of raising.

    Raises
    ------
    TypeError
        If the labels are not integers (they are used as column indices).
    ValueError
        If the input is not one-dimensional.
    """
    labels = np.asarray(vals)
    if labels.size == 0:
        # max() of an empty sequence is undefined; return an empty encoding.
        return np.zeros((0, 0))
    if labels.ndim != 1:
        raise ValueError("one_hot_encode expects a one-dimensional sequence of labels")
    if labels.dtype.kind not in 'iub':
        raise TypeError("one_hot_encode expects integer class labels")

    labels = labels.astype(np.intp)
    width = max(int(labels.max()) + 1, 0)
    encoded = np.zeros((labels.shape[0], width), dtype=float)
    # Set one cell per row in a single vectorised assignment.
    encoded[np.arange(labels.shape[0]), labels] = 1.
    return encoded


In [3]:
# Seed shared by the K-fold splitter and recorded with every result row.
RANDOM_SEED      = 42
# Number of cross-validation folds.
NUMBER_OF_SPLITS = 10
# Results CSV, stamped with the notebook start time. The timestamp is formatted
# explicitly because str(datetime.now()) contains spaces and colons, and colons
# are not valid in file names on Windows.
SAVE_FILE_NAME   = './../../../data/uci/processed/results/adult/results_NEAT_{}.csv'.format(
    datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

In [7]:
# Column names for the header-less Adult data file; the last one is the label.
adult_columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "gt50k"]
y_cols = 'gt50k'


# data = pd.read_csv('./../../../data/uci/processed/data/adult/adult.data',
data = pd.read_csv('./../../data/processed/adult.data',
                   names=adult_columns,
                   index_col=False)

# Every column except the label is a feature.
x_cols = data.columns.values.tolist()
x_cols.remove(y_cols)

xs_raw = data[x_cols]
ys_raw = data[y_cols]

# Split the features by dtype: object columns are one-hot encoded,
# int64 columns are standardised.
categorical_feature_mask = xs_raw.dtypes == object
numerical_feature_mask = xs_raw.dtypes == "int64"

categorical_cols = xs_raw.columns[categorical_feature_mask].tolist()
numerical_cols = xs_raw.columns[numerical_feature_mask].tolist()

# OHE categoricals
# dtype=float keeps the final matrix purely numeric: recent pandas versions
# return bool dummies, and mixing bool with float columns makes
# DataFrame.to_numpy() fall back to an object array.
onehotencoded = pd.get_dummies(xs_raw[categorical_cols], dtype=float)
xs = pd.concat([xs_raw.drop(categorical_cols, axis=1), onehotencoded], axis=1)

## Linear scaling
scaler = StandardScaler()
numericals = scaler.fit_transform(xs_raw[numerical_cols].values)
# Explicit index/columns so the assignment aligns by label rather than relying
# on `xs` having a default RangeIndex.
xs[numerical_cols] = pd.DataFrame(numericals, columns=numerical_cols, index=xs.index)


####

## Adjust outcome var
# The raw labels carry a leading space (' >50K'); strip before comparing so
# the encoding does not depend on that whitespace.
ys = (ys_raw.str.strip() == '>50K').astype('int64')

assert len(xs) == len(ys), "features and labels must have the same number of rows"

In [8]:
def mark_result(model_name, random_seed, cross_fold_index, predictions, AUC_score):
    """Package the outcome of one cross-validation fold as a flat record.

    `predictions` may be any iterable; it is materialised into a list so the
    record holds its own copy. All other arguments are stored unchanged.
    """
    record = dict(
        modelName=model_name,
        randomSeed=random_seed,
        crossFoldIndex=cross_fold_index,
    )
    record['predictions'] = list(predictions)
    record['auc'] = AUC_score
    return record

In [9]:
# Shuffled K-fold splitter; the fixed seed keeps fold membership reproducible.
kf = KFold(n_splits=NUMBER_OF_SPLITS, shuffle=True, random_state=RANDOM_SEED)

In [10]:
# Single 85/15 hold-out split, seeded from the shared RANDOM_SEED constant
# rather than a repeated literal. These four names are reassigned per fold
# by the cross-validation loop further down.
X_train, X_test, y_train, y_test = train_test_split(
    xs, ys, test_size=0.15, random_state=RANDOM_SEED)

# Performance

In [11]:
# NEAT configuration shared by every fold (each fold works on a deep copy).
# NOTE(review): the file is called "config-iris" although this notebook trains
# on the Adult data -- confirm its input/output sizes match the feature matrix.
config_path = "./config-iris"
base_config = neat.Config(
    neat.DefaultGenome,
    neat.DefaultReproduction,
    neat.DefaultSpeciesSet,
    neat.DefaultStagnation,
    config_path,
)

In [12]:
# Show the population size loaded from the config file.
base_config.pop_size

2

In [13]:
# Generation limit passed to `p.run(...)` for each fold.
maxNGenerations = 5

In [14]:
def instantiate_population(config, xs, ys, saveLocation):
    """Create a BackpropPopulation with the standard set of reporters attached.

    Parameters
    ----------
    config : neat.Config
        Configuration for this run; a copy is saved as ``config.conf`` inside
        `saveLocation`.
    xs, ys :
        Training inputs and targets, forwarded unchanged to BackpropPopulation.
    saveLocation : str
        Directory for the saved config, checkpoints and experiment output.
        It is created if it does not exist.

    Returns
    -------
    BackpropPopulation
        The population, ready for ``run``.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(saveLocation, exist_ok=True)

    config.save(os.path.join(saveLocation, 'config.conf'))

    # Create the population, which is the top-level object for a NEAT run.
    p = BackpropPopulation(config,
                           xs,
                           ys,
                           criterion=nn.BCEWithLogitsLoss())

    # Reporter order is significant: the training loop looks reporters up by
    # position (index 2 for checkpointing, index 3 for ancestry tracing).
    p.add_reporter(neat.StdOutReporter(True))  # progress on stdout
    p.add_reporter(neat.StatisticsReporter())
    # os.path.join keeps checkpoints inside saveLocation even when the path
    # is given without a trailing separator.
    p.add_reporter(neat.Checkpointer(
        5, filename_prefix=os.path.join(str(saveLocation), "checkpoint-")))
    p.add_reporter(backprop.BackpropReporter(True))
    p.add_reporter(ExperimentReporter(saveLocation))

    return p

In [15]:
# Result records; converted to a DataFrame and written to SAVE_FILE_NAME
# at the end of the notebook.
results = []

In [16]:
# Output directory template; `{}` is filled with the fold index in the training loop.
saveLocationTemplate = './../../data/experiments/adult/NEAT-performance-{}/'

In [17]:
# Inspect the hold-out training labels (still indexed by original row number).
y_train

19336    0
985      0
12675    0
15967    0
32031    0
        ..
29802    0
5390     0
860      0
15795    0
23654    0
Name: gt50k, Length: 27676, dtype: int64

In [18]:
# Same labels as a plain NumPy array -- the conversion the training loop
# applies before handing labels to the population.
y_train.reset_index(drop=True).to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

# Train an ExplaNEAT Model

In [None]:
def _predict_scores(net, inputs):
    """Return the first output of `net` for every row of `inputs` as a 1-D float array."""
    # assumes the network has a single output unit -- TODO confirm against the config
    return np.asarray([net.activate(row)[0] for row in inputs], dtype=float)


# Cross-validated NEAT training: one population per fold, scored by ROC AUC on
# the held-out fold. One `mark_result` record per fold is appended to the
# notebook-level `results` list.
for index, (train_index, test_index) in enumerate(kf.split(xs)):
    print("Currently training {}".format(index))
    # kf.split yields positional indices, so select with .iloc on both frames.
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys.iloc[train_index], ys.iloc[test_index]

    neat_x_train = X_train.reset_index(drop=True).to_numpy()
    neat_y_train = y_train.reset_index(drop=True).to_numpy()
    neat_x_test = X_test.to_numpy()

    def eval_genomes(genomes, config, ys=None):
        """Assign each genome a fitness equal to its ROC AUC on the training fold.

        `ys` is optional so the callback works whether the population calls it
        with or without labels; when omitted, this fold's training labels are used.
        """
        labels = neat_y_train if ys is None else ys
        for genome_id, genome in genomes:
            net = neat.nn.FeedForwardNetwork.create(genome, config)
            genome.fitness = float(
                roc_auc_score(labels, _predict_scores(net, neat_x_train)))

    start_time = datetime.now()

    print("################################################")
    print("################################################")
    print("Starting iteration {}".format(index))
    print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("################################################")
    print("################################################")

    # Each fold gets its own copy so runs cannot mutate the shared base config.
    config = deepcopy(base_config)

    # instantiate_population creates the directory if needed.
    saveLocation = saveLocationTemplate.format(index)

    p = instantiate_population(config, neat_x_train, neat_y_train, saveLocation)
    # Run for up to nGenerations generations.
    winner = p.run(eval_genomes, maxNGenerations)

    g = p.best_genome

    end_time = datetime.now()

    # reporters[2] is the Checkpointer added by instantiate_population.
    p.reporters.reporters[2].save_checkpoint(p.config, p.population, p.species, str(p.generation) + "-final")

    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

    # Score the held-out fold and keep the per-row predictions next to the
    # other artefacts of this fold.
    test_scores = _predict_scores(winner_net, neat_x_test)
    fold_predictions = pd.DataFrame({
        'y_true': y_test.to_numpy(),
        'y_score': test_scores,
    })
    fold_predictions.to_csv(os.path.join(saveLocation, 'results.csv'))

    # NOTE(review): reporters[3] is the BackpropReporter given the order used in
    # instantiate_population -- confirm it provides trace_ancestry_of_species
    # and `.ancestry`.
    ancestry = p.reporters.reporters[3].trace_ancestry_of_species(g.key, p.reproduction.ancestors)

    ancestors = {
        k: v['genome'] for k, v in p.reporters.reporters[3].ancestry.items()
    }

#         visualize.create_ancestry_video(p.config,
#                                         g,
#                                         ancestry,
#                                         ancestors,
#                                         p.reporters.reporters[1],
#                                         pathname=saveLocation)
    print("################################################")
    print("################################################")
    print("Have finished population {} iteration {}".format(config.pop_size, index))
    print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("The time is {}".format(end_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("################################################")
    print("################################################")

    # AUC is rank-based, so the raw network outputs can be used as scores.
    auc = roc_auc_score(y_test, test_scores)
    results.append(mark_result('NEAT', RANDOM_SEED, index, test_scores, auc))

Currently training 0
################################################
################################################
Starting iteration 0
Started at 07/05/2020, 15:34:33
################################################
################################################

 ****** Running generation 0 ****** 



In [None]:
# Persist the per-fold result records collected during training.
res_df = pd.DataFrame(results)
# Create the target directory first so a missing folder cannot discard the
# results of a long training run.
save_dir = os.path.dirname(SAVE_FILE_NAME)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)
res_df.to_csv(SAVE_FILE_NAME)

In [None]:
# Inspect the binary outcome series (1 where the raw label is ' >50K', else 0).
ys