In [1]:
import importlib
import numpy as np
import time
import string
from sklearn import datasets

In [2]:
# import dna_data
# from dna_data import DNA # To get this to work, needed to put 'if name == main' at bottom of dna.py 
# importlib.reload(dna_data)

# Class Definition

In [10]:
# A class to describe a population of virtual organisms
# In this case, each organism is just an instance of a DNA object

class Population:
    def __init__(self, data, preproc_algos, models, ens_methods, tgt, mut_rate, pop_sz, fit_exp, eval_perc, mating_pool_retain_perc, replace=True, midpt=False, verbose=False, debug=False):

        self.population = [] 
        self.mating_pool = [] 
        self.generations = 0
        self.evaluations = 0
        self.finished = False 
        self.mut_rate = mut_rate
        self.perfect_score = 0.43
        self.best = ""
        self.fitness_sum = 0
        self.fit_exp = fit_exp # Raise fitness to this power to increase (if > 1) prob. of higher fitness members breeding
        self.eval_perc = eval_perc/100 # % of members of population to evaluate and replace for 2nd+ generation
        self.mating_pool_retain_perc = mating_pool_retain_perc/100 # % of top fitness mems in mating pool to NOT replace
        self.replace_bool = replace # True: replace 'num_new_mems' mems of pop w/ lowest fitness w/ new children // False: new children will be appended to existing population
        self.verbose = verbose
        self.debug = debug
        self.eval_idxs = []
        
        # If not all mems of pop are being evaluated in each generation, can't use fitness for breeding
        if self.eval_perc != 1: self.midpt_bool = False # True: Choose point to split mems being bred // False: probabilistically select gene from one of mems being bred based on fitness
        else: self.midpt_bool = False
            
        # NEW - 10/10
        if len(preproc_algos) != len(data): self.preproc_algos = []
        else: self.preproc_algos = preproc_algos
            
        # Creating dictionary of dataframes to avoid passing actual df to each DNA instance
        self.data_dict = {}
        if (type(data) == list) & (len(data) > 0):
            count = 0
            for df in data:
                self.data_dict[count] = df.copy()
                count += 1
            del data
        else: raise Exception("No data passed or data passed was not in list form. Need at least dataframe in a list.")
        
        # Ensuring models were passed in correct format
        if (type(models) == list) & ( len(models) > 0 ): self.models = models
        else: raise Exception("No model passed or model(s) passed was not in list form. Need at least one model in a list.")
            
        self.tgt = tgt # array of ground truths we are trying to predict (i.e. y)
                
        for i in range(pop_sz):
            self.population.append( DNA( list(self.data_dict.keys()), self.preproc_algos, self.models, ens_methods, verbose=self.verbose ) )

            
    def calc_fitness(self):
        '''
        Calculates fitness for every member of the population, exponentiates the fitness and calculates population sum
        '''
        if self.debug: print("start of calc_fitness")
        self.fitness_sum = 0
    
        # For initial population, calculate fitness for every member before generating new members
        if self.generations == 0:
            
            self.eval_idxs = [i for i in range(len(self.population))]
            
            for i in self.eval_idxs:
                self.population[i].calc_fitness(self.data_dict, self.tgt)
                self.evaluations += 1
                print(f"Pre exp fitness: {self.population[i].fitness}")
                self.population[i].fitness = self.population[i].fitness**self.fit_exp
                print(f"Post exp fitness: {self.population[i].fitness}")
                self.fitness_sum += self.population[i].fitness
                print(f"fitness sum: {self.fitness_sum}")
                
        # For all following generations, evaluate only self.eval_perc % of members
        else:
            if self.eval_perc == 1: self.eval_idxs = [i for i in range(len(self.population))]
                
            # If not evaluating all mems of pop, randomly select eval_perc * len(pop) mems to eval
            else:
                self.eval_idxs = []
                while len(self.eval_idxs) < self.eval_perc * len(self.population):
                    self.eval_idxs.append(np.random.randint(0, len(self.population)))
                    self.eval_idxs = list(set(self.eval_idxs))
                    
            for i in self.eval_idxs:
                self.population[i].calc_fitness(self.data_dict, self.tgt)
                self.evaluations += 1
                self.population[i].fitness = self.population[i].fitness**self.fit_exp
                self.fitness_sum += self.population[i].fitness

            
    def gen_mating_pool(self):
        '''
        Generates mating pool as sorted list of tuples (pop_idx, exponentiated_fitness) w/ highest scoring mems first
        '''
        if self.debug: print("start of gen_mating_pool")
        self.mating_pool = []
    
        for i in range( len(self.eval_idxs) ): 
    
            # Appending (idx, normalized fitness) for each idx in eval_idxs
            self.mating_pool.append( (self.eval_idxs[i], self.population[self.eval_idxs[i]].fitness / max(self.fitness_sum, 1e-4) ) )
    
        # Sorting by fitness score in descending order
        self.mating_pool.sort(reverse=True, key = lambda x : x[1])
        
    
    def pick_mem_from_mating_pool(self):
        '''
        Selects a member (mem) from the mating pool to participate in crossover.

        Steps for selection:
            1). Draw random number b/w 0-1 (val)
            2). Subtract normalized fitness of 1st mem of mating pool (highest fitness) from val
            3). If val is now negative, mem of pop corresponding to first mem of mating pool is selected for crossover.
            4). If val is still positive, move to 2nd mem of mating pool and repeat until val is negative
        '''
        if self.debug: print("Start of pick_mem_from_mating_pool")
        val = np.random.random()
        
        if self.verbose: print(f"val: {val}")
        if self.verbose: print(f"mating pool: {self.mating_pool}")
#         print(f"val: {val}")
#         print(f"mating pool: {self.mating_pool}")
            
        for i in range( len(self.mating_pool) ):
            val -= self.mating_pool[i][1]
            if val < 0:
                break
        if self.verbose: print(f"idx of mating pool: {self.mating_pool[i][0]}")
        return self.mating_pool[i][0]
    
    def pick_mem_from_population(self):
        '''
        Randomly selects a member (mem) from the population to participate in crossover.
        '''            
        pop_idx = np.random.randint(0, len(self.population))
        return pop_idx
    
    
    def gen_new_pop(self):
        '''
        Generates new members (mems) of population by probabilistically mating existing mems in the mating pool.
        Mems of the mating pool w/ higher fitness are more likely to be selected for mating.
        '''
        if self.debug: print("Start of gen_new_pop")
        children = []
        
        ### Breeding children ###
        # For every member we have evaluated, we need to breed a replacement
        num_children = int( len(self.eval_idxs) * (1 - self.mating_pool_retain_perc) )
        print(f"Generating {num_children} children")
        for i in range( num_children ): 
                
            # Selecting members to be bred
            idx1 = self.pick_mem_from_mating_pool()
            if self.eval_perc == 1: idx2 = self.pick_mem_from_mating_pool()
            else: idx2 = self.pick_mem_from_mating_pool()
            
            #THIS WAS NEW --> would want with low mutation rate
            # Sampling a random (top 10 fitness) member if idx1 == idx2, w/ 50% probability
            if (idx1 == idx2) & (np.random.random() > 0.5): 
                rand_hi_idx = np.random.randint( min(10, len(self.mating_pool) ) )
                idx2 = self.mating_pool[rand_hi_idx][0]
                if self.verbose: print("idx1 == idx2 dealt with")
            
            partnerA = self.population[idx1]
            partnerB = self.population[idx2]
            
            # Crossover & mutation --> mutants (below) are initialized w/ empty genes for non-[models, data, preproc] genes
            child = partnerA.crossover(partnerB, midpt_bool=self.midpt_bool)
            child.mutate(self.mut_rate, self.models)
            children.append(child)

        ### Updating population ###
        if self.replace_bool:    
            if self.eval_perc == 1:
                for i in range(len(children)):
                    replace_idx = self.mating_pool[len(self.mating_pool) - i - 1][0]
                    self.population[replace_idx] = children[i] # Overwrites self.population[replace_idx].fitness w/ 0
            else: # Can use mating pool for replacements if it is not too small
                for i in range(len(children)):
                    replace_idx = self.mating_pool[len(self.mating_pool) - i - 1][0]
                    self.population[replace_idx] = children[i] # Overwrites self.population[replace_idx].fitness w/ 0
        else:
            self.population.extend(children)

        self.generations += 1

    def evaluate(self):
        '''
        Computes the current "most fit" member of the population and whether the perfect score has been achieved.
        '''
        world_record = 0
        idx = 0
        for i in range(len(self.population)): 
            if self.population[i].fitness > world_record:
                idx = i
                world_record = self.population[i].fitness
                
        print(f"World Record: {world_record**(1/self.fit_exp)}\n")

        self.best = self.population[idx]
        
        for dna in self.population:
            print({'Data':dna.genes['data'], 'Preprocessing':dna.genes['preproc'], 'Models':dna.genes['models']})
        
        if world_record**(1/self.fit_exp) >= self.perfect_score:
            self.finished = True
            
        print(f"\nBest: {self.get_best()}")
        print(f"Average: {self.get_average_fitness()}")

        # If we found the target phrase, stop
        if self.is_finished():
            print("\nWe did it :)")
            print(f"Result: {self.get_best(get_all=True)}")
            print(f"\nNum gens: {self.get_generations()}")
            print(f"Num evals: {self.get_evaluations()}")

    def is_finished(self):
        return self.finished
    
    def get_best(self, get_all = False):
        return self.best.get_genes(get_all = get_all)
    
    def get_evaluations(self):
        return self.evaluations

    def get_generations(self):
        return self.generations

    def get_average_fitness(self):
        total = 0
        for i in range( len( self.population ) ):
            total += self.population[i].fitness**(1/self.fit_exp)
        return total / len(self.population)
    
    
    def evolve(self):
    
        # Calculate fitness for each mem of pop, take fitness**fit_exp and calc fitness sum
        self.calc_fitness()
        
        # Compute most fit mem of pop and determine if finished
        self.evaluate()
        
        if not self.finished:

            # Generate mating pool array by sorting normalized fitness values
            self.gen_mating_pool()

            # Generate new population mems by crossover b/w existing mems of mating pool
            # Either replace existing mems or add new mems to pop
            self.gen_new_pop()

### Loading toy datasets

In [11]:
# Load the diabetes dataset
diabetes = datasets.load_diabetes()
diabetes_X = diabetes.data
diabetes_y = diabetes.target

In [12]:
feat_cuts = [int(diabetes_X.shape[1] / 4), int(diabetes_X.shape[1] / 2), int(diabetes_X.shape[1] * (3/4))]
data = [diabetes_X[:, : feat_cuts[0]], diabetes_X[:, feat_cuts[0] : feat_cuts[1]], 
        diabetes_X[:, feat_cuts[1] : feat_cuts[2]], diabetes_X[:, feat_cuts[2] : ]]

In [None]:
wine = datasets.load_wine()
wine_X = wine.data
wine_y = wine.target

In [None]:
feat_cuts = [int(wine_X.shape[1] / 4), int(wine_X.shape[1] / 2), int(wine_X.shape[1] * (3/4))]
data = [wine_X[:, : feat_cuts[0]], wine_X[:, feat_cuts[0] : feat_cuts[1]], 
        wine[:, feat_cuts[1] : feat_cuts[2]], wine[:, feat_cuts[2] : ]]

### Importing dna class

In [44]:
import dna_data_v5
from dna_data_v5 import DNA # To get this to work, needed to put 'if name == main' at bottom of dna.py 

In [45]:
importlib.reload(dna_data_v5)

<module 'dna_data_v5' from '/home/mjmrose/workspace/sbox-mjmrose/Bids/genAlgo/dna_data_v5.py'>

### Initializing a population

In [46]:
preproc_algos = []
models = ['rf', 'lr', 'xgb']
ens_methods = ['avg', 'lr', 'el_net']
tgt = diabetes_y
pop_sz = 10
eval_perc = 50
mating_pool_retain_perc = 10
mut_rate = 0.3
fit_exp = 2
pop = Population(data, preproc_algos, models, ens_methods, tgt, mut_rate, pop_sz, fit_exp, eval_perc, mating_pool_retain_perc, verbose=False)

self.genes['data']: [0, None, None, 3]
self.genes['models']: [None, 'lr', 'xgb']
self.genes['data']: [None, 1, 2, 3]
self.genes['models']: [None, 'lr', None]
self.genes['data']: [None, 1, 2, None]
self.genes['models']: ['rf', None, None]
self.genes['data']: [None, 1, 2, None]
self.genes['models']: ['rf', 'lr', 'xgb']
self.genes['data']: [0, None, None, 3]
self.genes['models']: ['rf', 'lr', 'xgb']
self.genes['data']: [None, 1, 2, 3]
self.genes['models']: [None, 'lr', 'xgb']
self.genes['data']: [0, 1, None, 3]
self.genes['models']: ['rf', 'lr', 'xgb']
self.genes['data']: [None, 1, 2, 3]
self.genes['models']: [None, 'lr', None]
self.genes['data']: [None, None, 2, 3]
self.genes['models']: ['rf', None, None]
self.genes['data']: [0, None, 2, 3]
self.genes['models']: ['rf', 'lr', 'xgb']


# Testing speed of convergence w/ gene selection based on fitness

In [47]:
start = time.time()
for i in range(2000):
    pop.evolve()
    if pop.is_finished():
        break
print(f"\nTime: {time.time() - start}")

Linear regression

R2 for test_preds: 0.29035729488090367
XGBoost
Performing randomized search
Best score: 0.356

R2 for test_preds: 0.23331235398505412
lr_ensembling

Score: 0.19639371253636528

Pre exp fitness: 0.19639371253636528
Post exp fitness: 0.03857049032381648
fitness sum: 0.03857049032381648
Linear regression

R2 for test_preds: 0.3997840862157056

Performing grid search
Best score: 0.510
	alpha: 0.2
	l1_ratio: 1

Score: 0.40594645958535824

Pre exp fitness: 0.40594645958535824
Post exp fitness: 0.1647925280498869
fitness sum: 0.20336301837370338
Random Forest
Performing randomized search
Best score: 0.376

R2 for test_preds: 0.32228326102754234

Performing grid search
Best score: 0.510
	alpha: 0.2
	l1_ratio: 1

Score: 0.3121895095378029

Pre exp fitness: 0.3121895095378029
Post exp fitness: 0.09746228986545392
fitness sum: 0.3008253082391573
Random Forest
Performing randomized search
Best score: 0.370

R2 for test_preds: 0.3528271456181842
Linear regression

R2 for test_pre

In [49]:
pop.best.genes['ens_model'].coef_

array([0.93337761])

In [None]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform
from sklearn.metrics import r2_score
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV

In [None]:
preds, ps, imps = train_xgb(diabetes_X, diabetes_y, diabetes_X, n_iter = 5)

In [None]:
def train_xgb(X_tr, y_tr, X_te, n_iter = 35, num_folds = 5):
    
    print("XGBoost")
    params = {'max_depth': sp_randint(1,5),
              'min_child_weight': sp_randint(1,35),
              'learning_rate': uniform(0.06,0.03),
              'reg_lambda': uniform(1,1),
              'subsample': uniform(0.8,0.2),
              'colsample_bytree':uniform(0.8,0.2)}

    rs = RandomizedSearchCV(estimator = xgb.XGBRegressor(),
        param_distributions=params, cv = num_folds, n_jobs = 24, n_iter = n_iter)

    print("\nPerforming randomized search")
    try: rs.fit(X_tr, y_tr)
    except Exception as e:
        print(e)
        pdb.set_trace()
    print("Best score: %0.3f" % rs.best_score_)
    best_parameters = rs.best_estimator_.get_params()
    preds = rs.predict(X_te)
    feat_imps = 
    
#     # Add features whose importance are 0
#     for i in range(x_train.shape[1]):
#         key = 'f' + str(i)
#         if key not in importances_raw.keys(): ## The importance of this feature is 0
#             importances_raw[key] = 0
    
    return preds, best_parameters, feat_imps