In [44]:
from __future__ import print_function
import os
import neat

import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim


# from explaneat.core.backprop import NeatNet
from explaneat.core.neuralneat import NeuralNeat as nneat
from explaneat.core import backprop
from explaneat.core.backproppop import BackpropPopulation
# from explaneat.visualization import visualize
from explaneat.core.experiment import ExperimentReporter
from explaneat.core.utility import one_hot_encode


from sklearn import datasets, metrics
from sklearn.preprocessing import StandardScaler, normalize, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from copy import deepcopy

import time
from datetime import datetime

In [45]:

USE_CUDA = torch.cuda.is_available()
USE_CUDA = False
device = torch.device("cuda:1" if USE_CUDA else "cpu")


In [46]:
def one_hot_encode(vals):
    width = max(vals)
    newVals = []
    for val in vals:
        blank = [0. for _ in range(width + 1)]
        blank[val] = 1.
        newVals.append(blank)
    return np.asarray(newVals)


In [47]:
RANDOM_SEED      = 42
NUMBER_OF_SPLITS = 10
SAVE_FILE_NAME   = './../../../data/uci/processed/results/adult/results_NEAT_{}.csv'.format(datetime.now())

In [48]:
adult_columns = [
    "age", 
    "workclass",
    "fnlwgt", 
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "gt50k"]
y_cols = 'gt50k'


# data = pd.read_csv('./../../../data/uci/processed/data/adult/adult.data',
data = pd.read_csv('./../../data/processed/adult.data',
                   names=adult_columns,
                  index_col=False)

x_cols = data.columns.values.tolist()
x_cols.remove(y_cols)

xs_raw = data[x_cols]
ys_raw = data[y_cols]

categorical_feature_mask = xs_raw.dtypes==object
numerical_feature_mask = xs_raw.dtypes=="int64"

categorical_cols = xs_raw.columns[categorical_feature_mask].tolist()
numerical_cols = xs_raw.columns[numerical_feature_mask].tolist()

scaler = StandardScaler()
xs = xs_raw.copy()

# OHE categoricals
onehotencoded = pd.get_dummies(xs_raw[categorical_cols])
xs[onehotencoded.columns] = onehotencoded
xs = xs.drop(categorical_cols, axis=1)

## Linear scaling
numericals = xs_raw[numerical_cols].values #returns a numpy array
scaler = StandardScaler()
numericals = scaler.fit_transform(xs_raw[numerical_cols].values)
xs[numerical_cols] = pd.DataFrame(numericals)


####

## Adjust outcome var
ys = data['gt50k'] == ' >50K'
ys = ys.apply(lambda x: 1 if x else 0)

In [49]:
def mark_result(model_name, random_seed, cross_fold_index, predictions, AUC_score):
    return {
        'modelName': model_name,
        'randomSeed': random_seed,
        'crossFoldIndex': cross_fold_index,
        'predictions': list(predictions),
        'auc':AUC_score
    }

In [50]:
kf = KFold(n_splits = NUMBER_OF_SPLITS, 
           random_state = RANDOM_SEED,
          shuffle=True)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.15, random_state=42)

# Performance

In [52]:
config_path = "./config-iris"
base_config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     config_path)

In [53]:
base_config.pop_size

2

In [54]:
maxNGenerations = 5

In [55]:
def instantiate_population(config, xs, ys, saveLocation):

    if not os.path.exists(saveLocation):
        os.makedirs(saveLocation)
        
    config.save(os.path.join(saveLocation, 'config.conf'))

    # Create the population, which is the top-level object for a NEAT run.
    p = BackpropPopulation(config, 
                            xs, 
                            ys, 
                            criterion=nn.BCEWithLogitsLoss())

    # Add a stdout reporter to show progress in the terminal.
    p.add_reporter(neat.StdOutReporter(True))
    stats = neat.StatisticsReporter()
    p.add_reporter(stats)
    p.add_reporter(neat.Checkpointer(5, filename_prefix=str(saveLocation) + "checkpoint-" ))
    bpReporter = backprop.BackpropReporter(True)
    p.add_reporter(bpReporter)
    p.add_reporter(ExperimentReporter(saveLocation))
    
    return p

In [56]:
results = []

In [57]:
saveLocationTemplate = './../../data/experiments/adult/newneat-performance-{}/'

In [58]:
y_train

19336    0
985      0
12675    0
15967    0
32031    0
        ..
29802    0
5390     0
860      0
15795    0
23654    0
Name: gt50k, Length: 27676, dtype: int64

In [59]:
y_train.reset_index(drop=True).to_numpy()

array([0, 0, 0, ..., 0, 0, 0])

# Train an ExplaNEAT Model

In [64]:
len(neat_x_train)

29304

In [69]:
for index, (train_index, test_index) in enumerate(kf.split(xs[:10000])):
    print("Currently training {}".format(index))
    X_train, X_test = xs.iloc[train_index], xs.iloc[test_index]
    y_train, y_test = ys[train_index], ys[test_index]
    
    neat_x_train, neat_y_train = X_train.reset_index(drop=True).to_numpy(), y_train.reset_index(drop=True).to_numpy()

    
    def eval_genomes(genomes, config):
        loss = nn.CrossEntropyLoss()
        for genome_id, genome in genomes.items():
            net = neat.nn.FeedForwardNetwork.create(genome, config)
            preds = net.activate(X_train)
            # preds = []
            # for xi in X_train:
                # preds.append(net.activate(xi))
    #         genome.fitness = float(1./loss(torch.tensor(preds), torch.tensor(ys)))
            roc_auc_score(y_train, preds)
    
    start_time = datetime.now()

    print("################################################")
    print("################################################")
    print("Starting iteration {}".format(index))
    print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("################################################")
    print("################################################")


    config = deepcopy(base_config)

    saveLocation = saveLocationTemplate.format(index)
    
    if not os.path.exists(saveLocation):
        os.makedirs(saveLocation)

    p = instantiate_population(config, neat_x_train, neat_y_train , saveLocation)
    # Run for up to nGenerations generations.
    winner = p.run(eval_genomes, maxNGenerations,nEpochs=10)

    g = p.best_genome


    end_time = datetime.now()

    p.reporters.reporters[2].save_checkpoint(p.config, p.population, p.species, str(p.generation) + "-final")  

    winner_net = neat.nn.FeedForwardNetwork.create(winner, config)

    results = []
    for xi, xo in zip(xs, ys):
        output = winner_net.activate(xi)
        results.append([xi[0], xi[1], xo, output])

    df = pd.DataFrame(results)
    df.to_csv(os.path.join(saveLocation, 'results.csv'))

    ancestry = p.reporters.reporters[3].trace_ancestry_of_species(g.key, p.reproduction.ancestors) 

    ancestors = {
        k: v['genome'] for k, v in p.reporters.reporters[3].ancestry.items()
    }

#         visualize.create_ancestry_video(p.config, 
#                                         g, 
#                                         ancestry, 
#                                         ancestors, 
#                                         p.reporters.reporters[1], 
#                                         pathname=saveLocation)
    print("################################################")
    print("################################################")
    print("Have finished population {} iteration {}".format(pop_size, iteration_no))
    print("Started at {}".format(start_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("The time is {}".format(end_time.strftime("%m/%d/%Y, %H:%M:%S")))
    print("################################################")
    print("################################################")

    
    
    
    # Use the forest's predict method on the test data
    rf_preds = rf.predict(X_test)
    # Calculate the absolute errors
    errors = abs(rf_preds - y_test)
    auc = roc_auc_score(y_test, rf_preds)
    results.append(mark_result('NEAT', RANDOM_SEED, index, rf_preds, auc))

Currently training 0
################################################
################################################
Starting iteration 0
Started at 04/13/2022, 18:34:29
################################################
################################################
The function - generationStart - has just started at 1649831669.284058

 ****** Running generation 0 ****** 

The function - generationStart - took 4.982948303222656e-05 seconds to complete
The function - pre_backprop - has just started at 1649831669.284124
The function - pre_backprop - took 2.3126602172851562e-05 seconds to complete
The function - backprop - has just started at 1649831669.284158
about to start backprop with 10 epochs
mean improvement: -0.018321163222223436
best improvement: -0.020964875903248803
best loss: 0.5322243736275976
The function - backprop - took 6.872208118438721 seconds to complete
The function - post_backprop - has just started at 1649831676.156383
The function - post_backprop - took 2.09808

RuntimeError: Expected 108 inputs, got 9000

In [None]:
res_df = pd.DataFrame(results)
res_df.to_csv(SAVE_FILE_NAME)

In [None]:
ys

In [70]:
net = nneat(p.population[1], p.config, criterion=nn.BCEWithLogitsLoss())

OUTPUT
1
{0: array([[ 0.83710898, -1.008707  ,  1.13473876, ...,  1.        ,
         0.        ,  0.        ],
       [-0.04264203,  0.2450785 , -0.42005962, ...,  1.        ,
         0.        ,  0.        ],
       [-0.77576787,  1.40817572,  1.13473876, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.28895595,  1.22583276, -0.03136003, ...,  1.        ,
         0.        ,  0.        ],
       [-0.11595461, -0.59721789, -3.1409568 , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03067056,  1.05913192, -0.03136003, ...,  1.        ,
         0.        ,  0.        ]])}
---------------
{1: {'nodes': {0: {'depth': 1, 'output_ids': [], 'input_ids': [-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32, -33, -34, -35, -36, -37, -38, -39, -40, -41, -42, -43, -44, -45, -46, -47, -48, -49, -50, -51, -52, -53, -54, -55, -56, -57, -58, -59, -60, -61

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray