In [1]:
import importlib
import numpy as np
import time
import string

In [2]:
import dna_data
from dna_data import DNA # To get this to work, needed to put 'if name == main' at bottom of dna.py 
importlib.reload(dna_data)

<module 'dna_data' from '/home/mjmrose/workspace/sbox-mjmrose/Bids/genAlgo/dna_data.py'>

# Class Definition

In [12]:
# A class to describe a population of virtual organisms
# In this case, each organism is just an instance of a DNA object

class Population:
    def __init__(self, data, preproc_algos, models, tgt, mut_rate, pop_sz, fit_exp, eval_perc, mating_pool_retain_perc, replace=True, midpt=False, verbose=False, debug=False):

        self.population = [] 
        self.mating_pool = [] 
        self.generations = 0
        self.evaluations = 0
        self.finished = False 
        self.mut_rate = mut_rate
        self.perfect_score = 0.3
        self.best = ""
        self.fitness_sum = 0
        self.fit_exp = fit_exp # Raise fitness to this power to increase (if > 1) prob. of higher fitness members breeding
        self.eval_perc = eval_perc/100 # % of members of population to evaluate and replace for 2nd+ generation
        self.mating_pool_retain_perc = mating_pool_retain_perc/100 # % of top fitness mems in mating pool to NOT replace
        self.replace_bool = replace # True: replace 'num_new_mems' mems of pop w/ lowest fitness w/ new children // False: new children will be appended to existing population
        self.verbose = verbose
        self.debug = debug
        self.eval_idxs = []
        
        # If not all mems of pop are being evaluated in each generation, can't use fitness for breeding
        if self.eval_perc != 1: self.midpt_bool = False # True: Choose point to split mems being bred // False: probabilistically select gene from one of mems being bred based on fitness
        else: self.midpt_bool = False
            
        # NEW - 10/10
        if len(preproc_algos) != len(data): self.preproc_algos = []
        else: self.preproc_algos = preproc_algos
            
        # Creating dictionary of dataframes to avoid passing actual df to each DNA instance
        self.data_dict = {}
        if (type(data) == list) & (len(data) > 0):
            count = 0
            for df in data:
                self.data_dict[count] = df.copy()
                count += 1
            del data
        else: raise Exception("No data passed or data passed was not in list form. Need at least dataframe in a list.")
        
        if (type(models) == list) & ( len(models) > 0 ): self.models = models
        else: raise Exception("No model passed or model(s) passed was not in list form. Need at least one model in a list.")
            
        self.tgt = tgt # array of ground truths we are trying to predict
                
        for i in range(pop_sz):
            self.population.append( DNA( list(self.data_dict.keys()), self.preproc_algos, self.models, verbose=self.verbose ) )

            
    def calc_fitness(self):
        '''
        Calculates fitness for every member of the population, exponentiates the fitness and calculates population sum
        '''
        if self.debug: print("start of calc_fitness")
        self.fitness_sum = 0
    
        # For initial population, calculate fitness for every member before generating new members
        if self.generations == 0:
            
            self.eval_idxs = [i for i in range(len(self.population))]
            
            for i in self.eval_idxs:
                self.population[i].calc_fitness(self.data_dict, self.tgt)
                self.evaluations += 1
                print(f"Pre exp fitness: {self.population[i].fitness}")
                self.population[i].fitness = self.population[i].fitness**self.fit_exp
                print(f"Post exp fitness: {self.population[i].fitness}")
                self.fitness_sum += self.population[i].fitness
                print(f"fitness sum: {self.fitness_sum}")
                
        # For all following generations, evaluate only self.eval_perc % of members
        else:
            if self.eval_perc == 1: self.eval_idxs = [i for i in range(len(self.population))]
                
            # If not evaluating all mems of pop, randomly select eval_perc * len(pop) mems to eval
            else:
                self.eval_idxs = []
                while len(self.eval_idxs) < self.eval_perc * len(self.population):
                    self.eval_idxs.append(np.random.randint(0, len(self.population)))
                    self.eval_idxs = list(set(self.eval_idxs))
                    
            for i in self.eval_idxs:
                self.population[i].calc_fitness(self.data_dict, self.tgt)
                self.evaluations += 1
                self.population[i].fitness = self.population[i].fitness**self.fit_exp
                self.fitness_sum += self.population[i].fitness

            
    def gen_mating_pool(self):
        '''
        Generates mating pool as sorted list of tuples (pop_idx, exponentiated_fitness) w/ highest scoring mems first
        '''
        if self.debug: print("start of gen_mating_pool")
        self.mating_pool = []
    
        for i in range( len(self.eval_idxs) ): 
    
            # Appending (idx, normalized fitness) for each idx in eval_idxs
            self.mating_pool.append( (self.eval_idxs[i], self.population[self.eval_idxs[i]].fitness / max(self.fitness_sum, 1e-4) ) )
    
        # Sorting by fitness score in descending order
        self.mating_pool.sort(reverse=True, key = lambda x : x[1])
        
    
    def pick_mem_from_mating_pool(self):
        '''
        Selects a member (mem) from the mating pool to participate in crossover.

        Steps for selection:
            1). Draw random number b/w 0-1 (val)
            2). Subtract normalized fitness of 1st mem of mating pool (highest fitness) from val
            3). If val is now negative, mem of pop corresponding to first mem of mating pool is selected for crossover.
            4). If val is still positive, move to 2nd mem of mating pool and repeat until val is negative
        '''
        if self.debug: print("Start of pick_mem_from_mating_pool")
        val = np.random.random()
        
        if self.verbose: print(f"val: {val}")
        if self.verbose: print(f"mating pool: {self.mating_pool}")
        print(f"val: {val}")
        print(f"mating pool: {self.mating_pool}")
            
        for i in range( len(self.mating_pool) ):
            val -= self.mating_pool[i][1]
            if val < 0:
                break
        if self.verbose: print(f"idx of mating pool: {self.mating_pool[i][0]}")
        return self.mating_pool[i][0]
    
    def pick_mem_from_population(self):
        '''
        Randomly selects a member (mem) from the population to participate in crossover.
        '''            
        pop_idx = np.random.randint(0, len(self.population))
        return pop_idx
    
    
    def gen_new_pop(self):
        '''
        Generates new members (mems) of population by probabilistically mating existing mems in the mating pool.
        Mems of the mating pool w/ higher fitness are more likely to be selected for mating.
        '''
        if self.debug: print("Start of gen_new_pop")
        children = []
        
        ### Breeding children ###
        # For every member we have evaluated, we need to breed a replacement
        num_children = int( len(self.eval_idxs) * (1 - self.mating_pool_retain_perc) )
        print(f"Generating {num_children} children")
        for i in range( num_children ): 
                
            # Selecting members to be bred
            idx1 = self.pick_mem_from_mating_pool()
            if self.eval_perc == 1: idx2 = self.pick_mem_from_mating_pool()
            else: idx2 = self.pick_mem_from_mating_pool()
            
            #THIS WAS NEW --> would want with low mutation rate
            # Sampling a random (top 10 fitness) member if idx1 == idx2, w/ 50% probability
            if (idx1 == idx2) & (np.random.random() > 0.5): 
                rand_hi_idx = np.random.randint( min(10, len(self.mating_pool) ) )
                idx2 = self.mating_pool[rand_hi_idx][0]
                if self.verbose: print("idx1 == idx2 dealt with")
            
            partnerA = self.population[idx1]
            partnerB = self.population[idx2]
            
#             if self.verbose: print(f"PartnerA: {partnerA.get_phrase()}")
#             if self.verbose: print(f"PartnerB: {partnerB.get_phrase()}")
    
            child = partnerA.crossover(partnerB, midpt_bool=self.midpt_bool)
            child.mutate(self.mut_rate)
#             if self.verbose: print(f"Child: {child.get_phrase()}")
            children.append(child)

        ### Updating population ###
        if self.replace_bool:    
            if self.eval_perc == 1:
                for i in range(len(children)):
                    replace_idx = self.mating_pool[len(self.mating_pool) - i - 1][0]
                    self.population[replace_idx] = children[i] # Overwrites self.population[replace_idx].fitness w/ 0
            else: # Can use mating pool for replacements if it is not too small
                for i in range(len(children)):
#                     replace_idx = self.eval_idxs[i]
                    replace_idx = self.mating_pool[len(self.mating_pool) - i - 1][0]
                    self.population[replace_idx] = children[i] # Overwrites self.population[replace_idx].fitness w/ 0
        else:
            self.population.extend(children)

        self.generations += 1

    def evaluate(self):
        '''
        Computes the current "most fit" member of the population and whether the perfect score has been achieved.
        '''
        world_record = 0
        idx = 0
        for i in range(len(self.population)): 
            if self.population[i].fitness > world_record:
                idx = i
                world_record = self.population[i].fitness
                
        print(f"World Record: {world_record**(1/self.fit_exp)}")

        self.best = self.population[idx].get_genes()
        if world_record**(1/self.fit_exp) == self.perfect_score:
            self.finished = True
            
        print(f"Best: {self.get_best()}")
        print(f"Average: {self.get_average_fitness()}")

        # If we found the target phrase, stop
        if self.is_finished():
            print("We did it :)")
            print(f"Result: {self.get_best()}")
            print(f"Num gens: {self.get_generations()}")
            print(f"Num evals: {self.get_evaluations()}")

    def is_finished(self):
        return self.finished
    
    def get_best(self):
        return self.best
    
    def get_evaluations(self):
        return self.evaluations

    def get_generations(self):
        return self.generations

    def get_average_fitness(self):
        total = 0
        for i in range( len( self.population ) ):
            total += self.population[i].fitness**(1/self.fit_exp)
        return total / len(self.population)
    
    
    def evolve(self):
    
        # Calculate fitness for each mem of pop, take fitness**fit_exp and calc fitness sum
        self.calc_fitness()
        
        # Compute most fit mem of pop and determine if finished
        self.evaluate()

        # Generate mating pool array by sorting normalized fitness values
        self.gen_mating_pool()

        # Generate new population mems by crossover b/w existing mems of mating pool
        # Either replace existing mems or add new mems to pop
        self.gen_new_pop()

### Initializing a population

In [17]:
from sklearn import datasets, linear_model

# Load the diabetes dataset
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data
diabetes_y = diabetes.target

In [14]:
import dna_data_v1
from dna_data_v1 import DNA # To get this to work, needed to put 'if name == main' at bottom of dna.py 
importlib.reload(dna_data_v1)

<module 'dna_data_v1' from '/home/mjmrose/workspace/sbox-mjmrose/Bids/genAlgo/dna_data_v1.py'>

In [15]:
data = [diabetes_X[:, : int(diabetes_X.shape[1] / 2)], diabetes_X[:, int(diabetes_X.shape[1] / 2) : ]]
preproc_algos = []
models = ['rf', 'lr']
tgt = diabetes_y
pop_sz = 5
eval_perc = 50
mating_pool_retain_perc = 10
mut_rate = 0.05
fit_exp = 2
pop = Population(data, preproc_algos, models, tgt, mut_rate, pop_sz, fit_exp, eval_perc, mating_pool_retain_perc, verbose=False)

data: [0, 1]
self.genes['data']: [0, 1]
data: [0, 1]
self.genes['data']: [0, 1]
data: [0, 1]
self.genes['data']: [0, 1]
data: [0, 1]
self.genes['data']: [0, 1]
data: [0, 1]
self.genes['data']: [0, 1]


# Testing speed of convergence w/ gene selection based on fitness

In [16]:
start = time.time()
for i in range(2000):
    pop.evolve()
    if pop.is_finished():
        break
print(f"\nTime: {time.time() - start}")


Performing randomized search
Best score: 0.445
Best parameters set:
	max_depth: 3
	max_features: 6
	min_samples_leaf: 17
	n_estimators: 15

R2 for test_preds: 0.3694837382039543

 test_preds head: [139.2079402  235.78503829 111.60372938 105.45923691 250.74170049]
Linear regression

R2 for test_preds: 0.4399387660024645

 test_preds head: [154.1235067  204.81721599 124.92988001 106.09339576 258.53035681]

pred_array.shape: (2, 89)

pred_array head: [[139.2079402  154.1235067 ]
 [235.78503829 204.81721599]
 [111.60372938 124.92988001]
 [105.45923691 106.09339576]
 [250.74170049 258.53035681]]
pred_array.shape post-processing: (89, 2)

y_te[:5]: [ 73. 233.  97. 111. 277.]
ens_preds.shape: (62, 2)
eval_preds.shape: (27, 2)
ens_labels.shape: (62,)
eval_labels.shape: (27,)

Performing grid search
Best score: 0.196
	alpha: 1
	l1_ratio: 1

el_net coefficients: [0.         0.78140295]
ens_eval_preds.shape: (27,)
eval_labels.shape: (27,)

Score: 0.18192841093783096

Pre exp fitness: 0.181928410

Best score: 0.196
	alpha: 1
	l1_ratio: 1

el_net coefficients: [0.         0.78140295]
ens_eval_preds.shape: (27,)
eval_labels.shape: (27,)

Score: 0.2408481450163651


Performing randomized search
Best score: 0.470
Best parameters set:
	max_depth: 11
	max_features: 5
	min_samples_leaf: 14
	n_estimators: 20

R2 for test_preds: 0.362259124302165

 test_preds head: [158.14772744 213.80443458 100.72595111  91.64647206 261.34250574]

pred_array.shape: (1, 89)

pred_array head: [[158.14772744]
 [213.80443458]
 [100.72595111]
 [ 91.64647206]
 [261.34250574]]
pred_array.shape post-processing: (89, 1)

y_te[:5]: [ 73. 233.  97. 111. 277.]
ens_preds.shape: (62, 1)
eval_preds.shape: (27, 1)
ens_labels.shape: (62,)
eval_labels.shape: (27,)

Performing grid search
Best score: 0.140
	alpha: 1
	l1_ratio: 1

el_net coefficients: [0.73393863]
ens_eval_preds.shape: (27,)
eval_labels.shape: (27,)

Score: 0.24200988770239795

World Record: 0.24200988770239795
Best: {'Data': [0, 1], 'Preprocessing': [], '

Best score: 0.472
Best parameters set:
	max_depth: 5
	max_features: 6
	min_samples_leaf: 12
	n_estimators: 24

R2 for test_preds: 0.3926571524385871

 test_preds head: [138.43175528 218.72328227 108.53040921  84.67408824 260.4245769 ]
Linear regression

R2 for test_preds: 0.4399387660024645

 test_preds head: [154.1235067  204.81721599 124.92988001 106.09339576 258.53035681]

pred_array.shape: (2, 89)

pred_array head: [[138.43175528 154.1235067 ]
 [218.72328227 204.81721599]
 [108.53040921 124.92988001]
 [ 84.67408824 106.09339576]
 [260.4245769  258.53035681]]
pred_array.shape post-processing: (89, 2)

y_te[:5]: [ 73. 233.  97. 111. 277.]
ens_preds.shape: (62, 2)
eval_preds.shape: (27, 2)
ens_labels.shape: (62,)
eval_labels.shape: (27,)

Performing grid search
Best score: 0.196
	alpha: 1
	l1_ratio: 1

el_net coefficients: [0.         0.78140295]
ens_eval_preds.shape: (27,)
eval_labels.shape: (27,)

Score: 0.19429122421339318


Performing randomized search
Best score: 0.462
Best param

Process ForkPoolWorker-2659:
Process ForkPoolWorker-2658:
Process ForkPoolWorker-2653:
Process ForkPoolWorker-2651:
Process ForkPoolWorker-2655:
Process ForkPoolWorker-2660:
Process ForkPoolWorker-2664:
Process ForkPoolWorker-2654:
Process ForkPoolWorker-2657:
Process ForkPoolWorker-2656:
Process ForkPoolWorker-2650:
Process ForkPoolWorker-2652:
Process ForkPoolWorker-2663:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call 

KeyboardInterrupt: 

In [None]:
import pandas as pd
test = pd.DataFrame(data = {'x':[1,2,3]})

In [13]:
test2 = pd.DataFrame(data = {'y':[4,2,3]})

In [16]:
z = {'test':test, 'test2':test2}

In [94]:
tuple(list(z.keys()))

('test', 'test2')

In [17]:
[z[key] for key in z.keys()]

[   x
 0  1
 1  2
 2  3,    y
 0  4
 1  2
 2  3]

In [18]:
pd.concat([z[key] for key in z.keys()], axis=1)

Unnamed: 0,x,y
0,1,4
1,2,2
2,3,3


In [15]:
pd.concat([test,test2], axis=1)

Unnamed: 0,x,y
0,1,4
1,2,2
2,3,3


In [97]:
a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
np.concatenate((a, b.T, b.T), axis=1)

array([[1, 2, 5, 5],
       [3, 4, 6, 6]])

In [201]:
b = np.array([5,6])
c = np.array([5,6])

In [202]:
b.shape == c.shape

True