In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
np.set_printoptions(suppress=True)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Patch Xeon Intel OneAPI Scikit accelerator
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Data
# Fixed seed so the synthetic data set and the train/test split are identical
# on every "Restart & Run All".
SEED = 42
from sklearn.datasets import make_classification
X, y = make_classification(1000, 100, n_redundant=0, n_repeated=0, scale=0.01, flip_y=0.1,
                           random_state=SEED)
#X.shape, y.shape

# Data Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Data Prepr
# The scaler is fitted on the training part only and then applied to the test part.
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
pip = make_pipeline(RobustScaler())
X_train_transf = pip.fit_transform(X_train)
X_test_transf = pip.transform(X_test)

In [None]:
from random import randint
from itertools import cycle
from time import monotonic
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from deap import creator as ga_cr, base as ga_b, algorithms as ga_algo, tools as ga_t

class GA_Scikit():
    """Genetic-algorithm hyper-parameter search for an estimator, built on DEAP.

    Every candidate list in ``params`` is padded (by cycling) to the length of
    the longest list, so that a single integer gene per parameter can index
    into it. An individual is a list of such indices, one per parameter, in
    the key order of ``params``.

    Parameters
    ----------
    estimator : object
        Passed to ``eval_func`` as ``est``.
    params : dict
        Parameter name -> iterable of candidate values.
    eval_func : callable
        ``eval_func(individual, padded_params=..., est=..., X_train=...,
        X_test=..., y_train=..., y_test=..., score=...)``; must return a tuple
        with one value per entry of ``eval_weights``.
    eval_weights : tuple
        DEAP fitness weights (positive = maximise, negative = minimise).
    X_train, X_test, y_train, y_test, score
        Forwarded untouched to ``eval_func``.
    sel_tournsize : int
        Tournament size for selection.
    cx_uniform_prob : float
        Per-gene swap probability of the uniform crossover.
    mut_shuffle_idx_prob : float
        Per-gene probability of the shuffle-indexes mutation.
    n_pop, n_gen, n_hof : int
        Population size, number of generations, hall-of-fame size.
    cx_prob, mut_prob : float
        Per-individual crossover / mutation probabilities for ``eaSimple``.
    n_jobs : int
        Number of worker processes used to evaluate individuals; values
        ``<= 1`` evaluate serially.
    """

    def __init__(self, estimator, params, eval_func, eval_weights, X_train, 
                 X_test, y_train, y_test, score, sel_tournsize=2, cx_uniform_prob=0.5, 
                 mut_shuffle_idx_prob=0.1, n_pop=25, n_gen=10, n_hof=3, cx_prob=0.5, 
                 mut_prob=0.1, n_jobs=4):
        self.est = estimator
        self.params = params
        self.eval_func = eval_func
        self.eval_weights = eval_weights
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.score = score
        self.sel_tournsize = sel_tournsize
        self.cx_uniform_prob = cx_uniform_prob
        self.mut_shuffle_idx_prob = mut_shuffle_idx_prob
        self.n_pop = n_pop
        self.n_gen = n_gen
        self.n_hof = n_hof
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob
        self.n_jobs = n_jobs
        
        self._pad_params()
        self._create_fitness_and_indiv()
        self._register_indiv_and_pop_generators()
        self._register_eval_func()
        self._register_selection_crossover_mutation_methods()
        
    def _pad_params(self):
        """Pad every candidate list to a common length by cycling its values.

        Stores the result in ``self.padded_params`` (name -> tuple, same key
        order as ``self.params``).

        Raises
        ------
        AssertionError
            If ``self.params`` is not a dict.
        ValueError
            If ``self.params`` is empty or a parameter has no candidates;
            there would be nothing for a gene to index.
        """
        assert isinstance(self.params, dict), 'Params must be a dict, i.e. estimator.get_params()'
        if not self.params:
            raise ValueError('Params must contain at least one parameter')
        candidates = {k: list(v) for k, v in self.params.items()}
        without_values = [k for k, v in candidates.items() if not v]
        if without_values:
            raise ValueError(f'Params without candidate values: {without_values}')
        max_length = max(len(v) for v in candidates.values())
        # shorter lists wrap around: ('a', 'b') padded to 5 -> ('a', 'b', 'a', 'b', 'a')
        self.padded_params = {
            k: tuple(v[i % len(v)] for i in range(max_length))
            for k, v in candidates.items()
        }
        
    def _create_fitness_and_indiv(self):
        """Create GA individual and fitness entities (classes)"""
        # NOTE(review): the names 'Fitness'/'Individual' live in the shared
        # deap.creator module, so building a second GA_Scikit redefines them
        # (DEAP presumably warns about the overwrite) - confirm this is intended.
        ga_cr.create('Fitness', ga_b.Fitness, weights=self.eval_weights)
        ga_cr.create('Individual', list, fitness=ga_cr.Fitness)

    def _gen_params_to_ga(self):
        """Generate one random index per parameter (the genes of an individual)."""
        n_genes = len(self.padded_params)
        # all padded tuples share the same length, so the first one is representative
        n_choices = len(next(iter(self.padded_params.values())))
        return [randint(0, n_choices - 1) for _ in range(n_genes)]
            
    def _register_indiv_and_pop_generators(self):
        """Register GA individual and population generators"""
        self.tb = ga_b.Toolbox()
        # The worker pool is created per run in run_ga_search so it can be
        # sized by n_jobs and released once the search has finished.
        self.tb.register("individual", ga_t.initIterate, ga_cr.Individual, self._gen_params_to_ga)
        self.tb.register("population", ga_t.initRepeat, list, self.tb.individual)
    
    def _register_eval_func(self):
        """Set GA evaluate individual function"""
        self.tb.register("evaluate",
                        self.eval_func,
                        padded_params=self.padded_params,
                        est=self.est,
                        X_train=self.X_train,
                        X_test=self.X_test, 
                        y_train=self.y_train, 
                        y_test=self.y_test,
                        score=self.score)
        
    def _register_selection_crossover_mutation_methods(self):
        """Register tournament selection, uniform crossover and shuffle-indexes mutation."""
        self.tb.register("select", ga_t.selTournament, tournsize=self.sel_tournsize)
        self.tb.register("mate", ga_t.cxUniform, indpb=self.cx_uniform_prob)
        self.tb.register("mutate", ga_t.mutShuffleIndexes, indpb=self.mut_shuffle_idx_prob)

    def _build_stats(self):
        """Build one averaged statistic per objective in ``eval_weights``."""
        # The first three names match the (score, duration, risk) tuples
        # returned by the evaluation functions in this notebook.
        default_names = ('score', 'duration', 'risk')
        per_objective = {}
        for i in range(len(self.eval_weights)):
            name = default_names[i] if i < len(default_names) else f'obj_{i}'
            # i=i freezes the index; a plain closure would see only the last i
            per_objective[name] = ga_t.Statistics(lambda ind, i=i: ind.fitness.values[i])
        stats = ga_t.MultiStatistics(**per_objective)
        stats.register("avg", np.mean)
        return stats

    def _hof_to_params(self, hof):
        """Convert hall-of-fame individuals to ``{'hof_<rank>': params}``.

        Iterates over the entries actually present, which can be fewer than
        ``n_hof``.
        """
        return {'hof_' + str(i): self._ga_to_params(ind) for i, ind in enumerate(hof)}
        
    def run_ga_search(self):
        """Run the GA and return ``(population, logbook, hall_of_fame_params)``.

        ``hall_of_fame_params`` maps ``'hof_0'``, ``'hof_1'``, ... to the
        decoded parameter dict of the corresponding hall-of-fame individual.
        """
        pop = self.tb.population(n=self.n_pop)
        hof = ga_t.HallOfFame(self.n_hof)
        stats = self._build_stats()

        pool = None
        if self.n_jobs > 1:
            from multiprocessing import Pool
            pool = Pool(processes=self.n_jobs)
            self.tb.register("map", pool.map)
        try:
            pop, log = ga_algo.eaSimple(pop, self.tb, cxpb=self.cx_prob, 
                                        mutpb=self.mut_prob, ngen=self.n_gen, 
                                        stats=stats, halloffame=hof, verbose=True)
        finally:
            if pool is not None:
                # release the workers and fall back to the serial built-in map
                pool.close()
                pool.join()
                self.tb.register("map", map)

        return pop, log, self._hof_to_params(hof)
    
    def _ga_to_params(self, idx_params):
        """Decode an individual (list of indices) into a ``{name: value}`` dict."""
        res = {}
        for (k,v), idx in zip(self.padded_params.items(), idx_params):
            res[k] = v[idx]
        return res

In [None]:
# Estimator, params and requirements

rf = RandomForestClassifier()

# Random-forest search space: parameter name -> candidate values.
# NOTE(review): recent scikit-learn releases appear to reject `max_samples`
# when `bootstrap=False` - confirm against the installed version.
rf_params = dict(
    class_weight=['balanced', 'balanced_subsample'],
    bootstrap=[False, True],
    n_estimators=np.arange(1, 101),
    max_depth=np.arange(1, 101),
    criterion=['gini', 'entropy'],
    max_features=np.linspace(.01, .99, 100),
    max_samples=np.linspace(.01, .99, 100),
)

def rf_eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Fit ``est`` with the parameters an individual encodes and score it.

    Each gene of ``individual`` is an index into the matching entry of
    ``padded_params``. ``n_jobs`` is forced to 1 on the estimator.

    Returns a 3-tuple whose order must match the GA fitness weights:
    ``(score(y_test, predictions), seconds spent in predict,
    median over test rows of the product of the class probabilities)``.
    """
    chosen = {}
    for (name, candidates), gene in zip(padded_params.items(), individual):
        chosen[name] = list(candidates)[gene]
    chosen['n_jobs'] = 1  # the GA itself may already run evaluations in parallel
    est.set_params(**chosen)
    est.fit(X_train, y_train)

    # only the predict call is timed
    t0 = monotonic()
    predictions = est.predict(X_test)
    elapsed = monotonic() - t0

    quality = score(y_test, predictions)
    proba_products = est.predict_proba(X_test).prod(axis=1)
    median_product = float(np.quantile(proba_products, 0.5))
    return (quality, elapsed, median_product)
        
# Fitness weights, in the order of the tuple returned by rf_eval_indiv:
# the first objective gets a positive weight, the other two negative ones.
eval_weights = (1, -1, -1)

# All GA settings not listed here keep the GA_Scikit defaults.
ga_params = GA_Scikit(
    estimator=rf,
    params=rf_params,
    eval_func=rf_eval_indiv,
    eval_weights=eval_weights,
    X_train=X_train_transf,
    X_test=X_test_transf,
    y_train=y_train,
    y_test=y_test,
    score=accuracy_score,
)
pop, log, hof = ga_params.run_ga_search()
hof

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

ad = AdaBoostClassifier()
#ad.get_params()

# AdaBoost search space: parameter name -> candidate values.
# NOTE(review): newer scikit-learn releases renamed AdaBoost's `base_estimator`
# to `estimator` - confirm against the installed version.
ad_params = {
    'algorithm': ['SAMME'],
     'base_estimator': [DecisionTreeClassifier(), LogisticRegression()],
     'learning_rate': np.linspace(0.001, 0.1, 100),
     'n_estimators': np.linspace(1, 100, 100).astype(int),
}

def ad_eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Fit ``est`` with the parameters an individual encodes and score it.

    Each gene of ``individual`` is an index into the matching entry of
    ``padded_params``. Unlike the forest variants, no ``n_jobs`` is set on
    the estimator.

    Returns a 3-tuple whose order must match the GA fitness weights:
    ``(score(y_test, predictions), seconds spent in predict,
    median over test rows of the product of the class probabilities)``.
    """
    chosen = {}
    for (name, candidates), gene in zip(padded_params.items(), individual):
        chosen[name] = list(candidates)[gene]
    est.set_params(**chosen)
    est.fit(X_train, y_train)

    # only the predict call is timed
    t0 = monotonic()
    predictions = est.predict(X_test)
    elapsed = monotonic() - t0

    quality = score(y_test, predictions)
    proba_products = est.predict_proba(X_test).prod(axis=1)
    median_product = float(np.quantile(proba_products, 0.5))
    return (quality, elapsed, median_product)
        
# Fitness weights, in the order of the tuple returned by ad_eval_indiv:
# the first objective gets a positive weight, the other two negative ones.
eval_weights = (1, -1, -1)

# All GA settings not listed here keep the GA_Scikit defaults.
ga_params = GA_Scikit(
    estimator=ad,
    params=ad_params,
    eval_func=ad_eval_indiv,
    eval_weights=eval_weights,
    X_train=X_train_transf,
    X_test=X_test_transf,
    y_train=y_train,
    y_test=y_test,
    score=accuracy_score,
)
pop, log, hof = ga_params.run_ga_search()
hof

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

xt = ExtraTreesClassifier()
#xt.get_params()

# Extra-trees search space: parameter name -> candidate values.
# Integer `min_samples_split` and `max_leaf_nodes` start at 2: scikit-learn
# rejects the value 1 for both, which would abort a GA run as soon as an
# individual picked it. Lists shorter than the longest one are padded by
# cycling before the search.
xt_params = {
             'bootstrap': [False, True],
             'ccp_alpha': np.linspace(0, 0.1, 100),
             'class_weight': ['balanced', 'balanced_subsample'],
             'criterion': ['gini', 'entropy'],
             'max_depth': np.linspace(1, 100, 100).astype(int),
             'max_features': np.linspace(0.01, 0.99, 100),
             'max_leaf_nodes': np.arange(2, 101),
             'max_samples': np.linspace(0.01, 0.99, 100),
             'min_impurity_decrease': np.linspace(0, 0.1, 100),
             'min_samples_leaf': np.linspace(1, 100, 100).astype(int),
             'min_samples_split': np.arange(2, 101),
             'n_estimators': np.linspace(1, 100, 100).astype(int),
             }

def xt_eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Fit ``est`` with the parameters an individual encodes and score it.

    Each gene of ``individual`` is an index into the matching entry of
    ``padded_params``. ``n_jobs`` is forced to 1 on the estimator.

    Returns a 3-tuple whose order must match the GA fitness weights:
    ``(score(y_test, predictions), seconds spent in predict,
    median over test rows of the product of the class probabilities)``.
    """
    chosen = {}
    for (name, candidates), gene in zip(padded_params.items(), individual):
        chosen[name] = list(candidates)[gene]
    chosen['n_jobs'] = 1  # the GA itself may already run evaluations in parallel
    est.set_params(**chosen)
    est.fit(X_train, y_train)

    # only the predict call is timed
    t0 = monotonic()
    predictions = est.predict(X_test)
    elapsed = monotonic() - t0

    quality = score(y_test, predictions)
    proba_products = est.predict_proba(X_test).prod(axis=1)
    median_product = float(np.quantile(proba_products, 0.5))
    return (quality, elapsed, median_product)
        
# Fitness weights, in the order of the tuple returned by xt_eval_indiv:
# the first objective gets a positive weight, the other two negative ones.
eval_weights = (1, -1, -1)

# Larger population and more generations than the GA_Scikit defaults;
# every other GA setting keeps its default.
ga_params = GA_Scikit(
    estimator=xt,
    params=xt_params,
    eval_func=xt_eval_indiv,
    eval_weights=eval_weights,
    X_train=X_train_transf,
    X_test=X_test_transf,
    y_train=y_train,
    y_test=y_test,
    score=accuracy_score,
    n_pop=50,
    n_gen=15,
)
pop, log, hof = ga_params.run_ga_search()
hof

In [None]:
# Hard-coded extra-trees parameters, presumably copied from an earlier GA
# run's hall of fame - verify before relying on them.
# NOTE(review): `bootstrap=False` together with `max_samples` appears to be
# rejected by recent scikit-learn releases - confirm against the installed version.
xt_params_best = {'bootstrap': False,
  'ccp_alpha': 0.030303030303030304,
  'class_weight': 'balanced_subsample',
  'criterion': 'entropy',
  'max_depth': 27,
  'max_features': 0.9504040404040404,
  'max_leaf_nodes': 65,
  'max_samples': 0.7722222222222221,
  'min_impurity_decrease': 0.012121212121212121,
  'min_samples_leaf': 15,
  'min_samples_split': 65,
  'n_estimators': 58}
xt.set_params(**xt_params_best)
xt.fit(X_train_transf, y_train)
# Mean over the test rows of the product of the per-class probabilities.
xt.predict_proba(X_test_transf).prod(axis=1).mean()

In [None]:
# Hard-coded random-forest parameters, presumably copied from an earlier GA
# run's hall of fame - verify before relying on them.
rf_params_best = {'class_weight': 'balanced_subsample',
  'bootstrap': True,
  'n_estimators': 17,
  'max_depth': 4,
  'criterion': 'gini',
  'max_features': 0.6039393939393939,
  'max_samples': 0.8712121212121212}
rf.set_params(**rf_params_best)
rf.fit(X_train_transf, y_train)
# Mean over the test rows of the product of the per-class probabilities.
rf.predict_proba(X_test_transf).prod(axis=1).mean()

In [None]:
# PREVIOUS WORK

In [None]:
# Long version, simple OneMax problem

# Class factory 
# The cells up to here only import DEAP under the ga_* aliases; bring in the
# plain module names this cell uses so it runs on a fresh kernel.
from deap import base, creator, tools

# Single-objective fitness with a positive weight, and a list-based individual.
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

# everything else is stored inside the toolbox
toolbox = base.Toolbox()
# one gene = a random 0/1; an individual = 100 genes; a population = a list of individuals
toolbox.register("attr_bool", randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, 100)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalOneMax(individual):
    """OneMax fitness: the number of ones, wrapped in a 1-tuple."""
    ones = sum(individual)
    return (ones,)

toolbox.register("evaluate", evalOneMax)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)

def main():
    """Run the hand-written OneMax GA loop on the module-level ``toolbox``.

    Starts from 300 individuals and iterates select -> crossover -> mutate ->
    re-evaluate until the best fitness reaches 100 or the generation counter
    hits 1000, printing min/max/avg/std of the fitness for every generation.
    Returns nothing.
    """
    # Only `randint` is imported at file level; the crossover/mutation
    # decisions below need uniform draws from random().
    from random import random

    pop = toolbox.population(n=300)
    
    # evaluate the whole initial population
    fitnesses = list(map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
        
    crossover_prob, mutate_prob = 0.5, 0.2
    
    fits = [ind.fitness.values[0] for ind in pop]
    
    g = 1
    while max(fits) < 100 and g < 1000:
        print(f"Gen {g:^3}:", end=' ')
        
        # next generation: select, then clone so the operators below work on copies
        offspring = toolbox.select(pop, len(pop))
        offspring = list(map(toolbox.clone, offspring))

        # mate neighbouring pairs; a changed individual loses its cached fitness
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random() < crossover_prob:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values

        for mutant in offspring:
            if random() < mutate_prob:
                toolbox.mutate(mutant)
                del mutant.fitness.values
                
        # re-evaluate only the individuals whose fitness was invalidated
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            
        # the offspring replace the population in place
        pop[:] = offspring
        
        fits = [ind.fitness.values[0] for ind in pop]
        
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        # abs() guards against a tiny negative variance from rounding
        std = abs(sum2 / length - mean**2)**0.5
        
        print(f"Min {min(fits):.2f} |", end=' ')
        print(f"Max {max(fits):.2f} |", end=' ')
        print(f"Avg {mean:.2f} |", end=' ')
        print(f"Std {std:.2f}")
                
        g += 1
        
main()        

In [None]:
# Short version, simple OneMax problem and similar

creator.create("FitnessMax", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("attr_bool", randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, 1000)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

def evalOneMax(individual):
    """MSE-ish fitness: mean of the squared genes, returned as a 1-tuple."""
    return sum(gene**2 for gene in individual) / len(individual),

toolbox.register("evaluate", evalOneMax)
toolbox.register("select", tools.selTournament, tournsize=2)
toolbox.register("mate", tools.cxUniform, indpb=0.5)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)

def main():
    """Run DEAP's ``eaSimple`` on the module-level ``toolbox``.

    Uses a population of 500, 200 generations, and tracks avg/std/min/max of
    the fitness values plus a one-slot hall of fame.

    Returns
    -------
    (pop, hof) : the final population and the hall of fame.
    """
    # The file only imports this module under the alias `ga_algo`; import the
    # names used here locally so the function works on a fresh kernel.
    from deap import algorithms, tools

    pop = toolbox.population(n=500)
    hof = tools.HallOfFame(1)
    
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.01, ngen=200, 
                                   stats=stats, halloffame=hof, verbose=True)
    
    return pop, hof
    
#pop, hof = main()

In [None]:
creator.create('Fitness', base.Fitness, weights=(1,))
creator.create('Individual', list, fitness=creator.Fitness)

tb = base.Toolbox()
tb.register("attr_bool", randint, 0, 3)
tb.register("individual", tools.initRepeat, creator.Individual, tb.attr_bool, 5)
tb.register("population", tools.initRepeat, list, tb.individual)

tb.population(5)

In [None]:
# Some tests

from itertools import combinations

creator.create('Fitness', base.Fitness, weights=(1,))
creator.create('Individual', list, fitness=creator.Fitness)

def some_gen():
    """Return 5 values drawn with replacement from 0..49 as a numpy array."""
    picks = np.random.randint(0, 50, size=5)
    return np.array(range(50))[picks]

tb = base.Toolbox()
tb.register("individual", tools.initIterate, creator.Individual, some_gen)
tb.register("population", tools.initRepeat, list, tb.individual)

pop = tb.population(5)

tb.register("mate", tools.cxUniform, indpb=0.20)
pop[0]
pop[1]
tb.mate(pop[0], pop[1])

tb.register("mutate", tools.mutShuffleIndexes, indpb=0.1)
tb.mutate(pop[0])

In [None]:
!pip install scikit-learn-intelex

In [None]:
# Patch Xeon Intel OneAPI Scikit accelerator
!pip install scikit-learn-intelex
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Data
# Fixed seed so the synthetic data set and the train/test split are identical
# on every "Restart & Run All".
SEED = 42
from sklearn.datasets import make_classification
X, y = make_classification(1000, 100, n_redundant=0, n_repeated=0, scale=0.01, flip_y=0.1,
                           random_state=SEED)
#X.shape, y.shape

# Data Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)
#X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Data Prepr
# The scaler is fitted on the training part only and then applied to the test part.
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
pip = make_pipeline(RobustScaler())
X_train_transf = pip.fit_transform(X_train)
X_test_transf = pip.transform(X_test)

In [None]:
# My DEAP Scikit implementation
# Trick: convert params to n*n grid, feed idxs to GA
# TODO code class object
# TODO code cli possibly with click package and publish
# TODO ...
# Can be unique or multiobjective

from random import randint
from itertools import cycle, chain
from collections import defaultdict

# Estimator params
rf_params = {
            #'class_weight': ['balanced', 'balanced_subsample'],
            #'bootstrap': [False, True],
            'n_estimators': np.linspace(1,50).astype(int),
            'max_depth': np.linspace(1,50).astype(int),
             #'criterion': ['gini', 'entropy'],
             'max_features': np.linspace(.01, .99),
             'max_samples': np.linspace(.01, .99),
             }

# Pad estimator params
def _params_pad(params):
    """Pad params for crossover shuffle idx method"""
    params_count = {k: len(v) for k,v in params.items()}
    max_length, max_key = -99, ''
    for k, v in params_count.items():
        if v <= max_length:
            continue
        else:
            max_key = k
            max_length = v
    assert isinstance(max_length, int), 'The max length between all params must be an int'
    # cycle through params for max length param, otherwise infinite cycle
    values_padded = (cycle(v) if k!=max_key else v for k,v in params.items())
    values_padded = zip(*values_padded)  # ('a', 1, 14), ('b', 2, 16), ('c', 3, 16) ...
    values_padded = zip(*values_padded)  # ('a', 'b', 'c'), (1, 2, 3), (14, 15, 16)...
    padded_params = {}
    for k, v in zip(params, values_padded):
        padded_params[k] = v
    return padded_params
    
padded_params = _params_pad(rf_params)
#rf_params
#padded_params

# Params sampler for individual GA
def _gen_params(padded_params=padded_params):
    """Draw one random index per parameter; the list is one individual's genes.

    The index range is taken from the length of the first padded entry.
    """
    columns = list(padded_params.values())
    highest_index = len(columns[0]) - 1
    return [randint(0, highest_index) for _column in columns]

#some_params = _gen_params()
#some_params

from deap import creator, base, algorithms as ga_algo, tools

# GA Fitness
creator.create('Fitness', base.Fitness, weights=(0.75,))
creator.create('Individual', list, fitness=creator.Fitness)

# GA toolbox
tb = base.Toolbox()

# Multiprocessing for GA
### TODO code for n_jobs
from multiprocessing import Pool
pool = Pool()
tb.register("map", pool.map)

# GA Individual and Pop
tb.register("individual", tools.initIterate, creator.Individual, _gen_params)
#tb.individual()
tb.register("population", tools.initRepeat, list, tb.individual)
#tb.population(3)

# Specific fitness
def eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Evaluate individual's genes (estimator's params)

    Each gene is an index into the matching entry of ``padded_params``. The
    estimator is refitted with those values (and ``n_jobs=-1``).

    Returns a 1-tuple ``(score(y_test, predictions),)``; DEAP fitness values
    must always be a tuple.
    """
    # convert indiv genes to estimator params
    indiv_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual)}
    ### TODO code for n_jobs
    est.set_params(**{**indiv_params, **{'n_jobs': -1} })
    est.fit(X_train, y_train)
    obj1 = score(y_test, est.predict(X_test))
    return (obj1,)
    
# Scikit Estimator
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.metrics import accuracy_score

# GA selection, crossover and mutation
tb.register("evaluate", 
            eval_indiv, 
            padded_params=padded_params, 
            est=rf, 
            X_train=X_train_transf, 
            X_test=X_test_transf, 
            y_train=y_train, 
            y_test=y_test, 
            score=accuracy_score)
tb.register("select", tools.selTournament, tournsize=2)
tb.register("mate", tools.cxUniform, indpb=0.5)
tb.register("mutate", tools.mutShuffleIndexes, indpb=0.1)

#_eval(tb.individual(), padded_params, rf, X_train_transf, X_test_transf, y_train, y_test, accuracy_score)

def main(tb=tb, tools=tools, ga_algo=ga_algo, n_pop=50, n_gen=50):
    """Run ``eaSimple`` on ``tb`` and return ``(population, logbook, hall_of_fame)``.

    Parameters
    ----------
    tb : DEAP toolbox with individual/population/evaluate/select/mate/mutate registered.
    tools, ga_algo : the DEAP ``tools`` and ``algorithms`` modules.
    n_pop : population size.
    n_gen : number of generations.
    """
    from multiprocessing import Pool

    pop = tb.population(n=n_pop)
    hof = tools.HallOfFame(2)
    
    # Stats stdout
    stats = tools.Statistics(lambda ind: ind.fitness.values )
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    # GA Run
    # The pool is created here, after every function of the cell has been
    # defined: a pool forked earlier would look up `eval_indiv` in a worker
    # that only knows whatever existed at fork time (nothing, or a stale
    # version from a previous cell). The `with` block releases the workers.
    with Pool() as pool:
        tb.register("map", pool.map)
        try:
            pop, log = ga_algo.eaSimple(pop, tb, cxpb=0.5, mutpb=0.1, ngen=n_gen, stats=stats, halloffame=hof, verbose=True)
        finally:
            # never leave a dead pool registered on the toolbox
            tb.register("map", map)
    
    return pop, log, hof
    
pop, log, hof = main()

f'Best Params {hof[0]}'
f'Best Params 2nd {hof[1]}'

In [None]:
# Same but multi-objective
# Max accuracy score
# min number of trees

from random import randint
from itertools import cycle, chain
from collections import defaultdict

# Estimator params
rf_params = {
            #'class_weight': ['balanced', 'balanced_subsample'],
            #'bootstrap': [False, True],
            'n_estimators': np.linspace(1,50).astype(int),
            'max_depth': np.linspace(1,50).astype(int),
             #'criterion': ['gini', 'entropy'],
             'max_features': np.linspace(.01, .99),
             'max_samples': np.linspace(.01, .99),
             }

# Pad estimator params
def _params_pad(params):
    """Pad params for crossover shuffle idx method"""
    params_count = {k: len(v) for k,v in params.items()}
    max_length, max_key = -99, ''
    for k, v in params_count.items():
        if v <= max_length:
            continue
        else:
            max_key = k
            max_length = v
    assert isinstance(max_length, int), 'The max length between all params must be an int'
    # cycle through params for max length param, otherwise infinite cycle
    values_padded = (cycle(v) if k!=max_key else v for k,v in params.items())
    values_padded = zip(*values_padded)  # ('a', 1, 14), ('b', 2, 16), ('c', 3, 16) ...
    values_padded = zip(*values_padded)  # ('a', 'b', 'c'), (1, 2, 3), (14, 15, 16)...
    padded_params = {}
    for k, v in zip(params, values_padded):
        padded_params[k] = v
    return padded_params
    
padded_params = _params_pad(rf_params)
#rf_params
#padded_params

# Params sampler for individual GA
def _gen_params(padded_params=padded_params):
    """Draw one random index per parameter; the list is one individual's genes.

    The index range is taken from the length of the first padded entry.
    """
    columns = list(padded_params.values())
    highest_index = len(columns[0]) - 1
    return [randint(0, highest_index) for _column in columns]

#some_params = _gen_params()
#some_params

from deap import creator, base, algorithms as ga_algo, tools, cma

# GA Fitness
creator.create('Fitness', base.Fitness, weights=(1, -1))
creator.create('Individual', list, fitness=creator.Fitness)

# GA toolbox
tb = base.Toolbox()

# Multiprocessing for GA
### TODO code for n_jobs
from multiprocessing import Pool
pool = Pool()
tb.register("map", pool.map)

# GA Individual and Pop
tb.register("individual", tools.initIterate, creator.Individual, _gen_params)
#tb.individual()
tb.register("population", tools.initRepeat, list, tb.individual)
#tb.population(3)

# Specific fitness
def eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Evaluate individual's genes (estimator's params)

    Each gene is an index into the matching entry of ``padded_params``. The
    estimator is refitted with those values (and ``n_jobs=-1``).

    Returns a 2-tuple ``(score(y_test, predictions), n_estimators)``; DEAP
    fitness values must always be a tuple matching the fitness weights.
    """
    # convert indiv genes to estimator params
    indiv_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual)}
    ### TODO code for n_jobs
    est.set_params(**{**indiv_params, **{'n_jobs': -1} })
    est.fit(X_train, y_train)
    obj1 = score(y_test, est.predict(X_test))
    # Use the decoded n_estimators value, not the gene (an index into the
    # grid); fall back to the first gene when the search space has no such key.
    obj2 = int(indiv_params.get('n_estimators', individual[0]))
    return (obj1, obj2)
    
# Scikit Estimator
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.metrics import accuracy_score

# GA selection, crossover and mutation
tb.register("evaluate", 
            eval_indiv, 
            padded_params=padded_params, 
            est=rf, 
            X_train=X_train_transf, 
            X_test=X_test_transf, 
            y_train=y_train, 
            y_test=y_test, 
            score=accuracy_score)
tb.register("select", tools.selTournament, tournsize=2)
tb.register("mate", tools.cxUniform, indpb=0.5)
tb.register("mutate", tools.mutShuffleIndexes, indpb=0.1)

#_eval(tb.individual(), padded_params, rf, X_train_transf, X_test_transf, y_train, y_test, accuracy_score)

def main(tb=tb, tools=tools, ga_algo=ga_algo, n_pop=50, n_gen=50):
    """Run ``eaSimple`` on ``tb`` and return ``(population, logbook, hall_of_fame)``.

    Statistics (avg/min/max) are reported separately for the two objectives
    under the names ``acc`` and ``n_est``.

    Parameters
    ----------
    tb : DEAP toolbox with individual/population/evaluate/select/mate/mutate registered.
    tools, ga_algo : the DEAP ``tools`` and ``algorithms`` modules.
    n_pop : population size.
    n_gen : number of generations.
    """
    from multiprocessing import Pool

    pop = tb.population(n=n_pop)
    hof = tools.HallOfFame(2)
    
    # Stats stdout
    stats1 = tools.Statistics(lambda ind: ind.fitness.values[0] )
    stats2 = tools.Statistics(lambda ind: ind.fitness.values[1] )
    stats = tools.MultiStatistics(acc=stats1, n_est=stats2)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    # GA Run
    # The pool is created here, after every function of the cell has been
    # defined: a pool forked earlier would look up `eval_indiv` in a worker
    # that only knows whatever existed at fork time (possibly a stale version
    # from a previous cell). The `with` block releases the workers.
    with Pool() as pool:
        tb.register("map", pool.map)
        try:
            pop, log = ga_algo.eaSimple(pop, tb, cxpb=0.5, mutpb=0.1, ngen=n_gen, stats=stats, halloffame=hof, verbose=True)
        finally:
            # never leave a dead pool registered on the toolbox
            tb.register("map", map)
    
    return pop, log, hof
    
pop, log, hof = main()

f'Best Params {hof[0]}'
f'Best Params 2nd {hof[1]}'

In [None]:
# Same but multi-objective
# Max accuracy score
# min number of trees
# Min risky predict probs

from random import randint
from itertools import cycle, chain
from collections import defaultdict

# Estimator params
rf_params = {
            #'class_weight': ['balanced', 'balanced_subsample'],
            #'bootstrap': [False, True],
            'n_estimators': np.linspace(1,50).astype(int),
            'max_depth': np.linspace(1,50).astype(int),
             #'criterion': ['gini', 'entropy'],
             'max_features': np.linspace(.01, .99),
             'max_samples': np.linspace(.01, .99),
             }

# Pad estimator params
def _params_pad(params):
    """Pad params for crossover shuffle idx method"""
    params_count = {k: len(v) for k,v in params.items()}
    max_length, max_key = -99, ''
    for k, v in params_count.items():
        if v <= max_length:
            continue
        else:
            max_key = k
            max_length = v
    assert isinstance(max_length, int), 'The max length between all params must be an int'
    # cycle through params for max length param, otherwise infinite cycle
    values_padded = (cycle(v) if k!=max_key else v for k,v in params.items())
    values_padded = zip(*values_padded)  # ('a', 1, 14), ('b', 2, 16), ('c', 3, 16) ...
    values_padded = zip(*values_padded)  # ('a', 'b', 'c'), (1, 2, 3), (14, 15, 16)...
    padded_params = {}
    for k, v in zip(params, values_padded):
        padded_params[k] = v
    return padded_params
    
padded_params = _params_pad(rf_params)
#rf_params
#padded_params

# Params sampler for individual GA
def _gen_params(padded_params=padded_params):
    """Draw one random index per parameter; the list is one individual's genes.

    The index range is taken from the length of the first padded entry.
    """
    columns = list(padded_params.values())
    highest_index = len(columns[0]) - 1
    return [randint(0, highest_index) for _column in columns]

#some_params = _gen_params()
#some_params

from deap import creator, base, algorithms as ga_algo, tools, cma

# GA Fitness
creator.create('Fitness', base.Fitness, weights=(1, -1, -1))
creator.create('Individual', list, fitness=creator.Fitness)

# GA toolbox
tb = base.Toolbox()

# Multiprocessing for GA
### TODO code for n_jobs
from multiprocessing import Pool
pool = Pool()
tb.register("map", pool.map)

# GA Individual and Pop
tb.register("individual", tools.initIterate, creator.Individual, _gen_params)
#tb.individual()
tb.register("population", tools.initRepeat, list, tb.individual)
#tb.population(3)

# Specific fitness
def eval_indiv(individual, padded_params, est, X_train, X_test, y_train, y_test, score):
    """Evaluate individual's genes (estimator's params)

    Each gene is an index into the matching entry of ``padded_params``. The
    estimator is refitted with those values (and ``n_jobs=-1``).

    Returns a 3-tuple ``(score(y_test, predictions), n_estimators,
    median over test rows of the product of the class probabilities)``; DEAP
    fitness values must always be a tuple matching the fitness weights.
    """
    # convert indiv genes to estimator params
    indiv_params = {k : list(v)[idx] for (k,v), idx in zip(padded_params.items(), individual)}
    ### TODO code for n_jobs
    est.set_params(**{**indiv_params, **{'n_jobs': -1} })
    est.fit(X_train, y_train)
    obj1 = score(y_test, est.predict(X_test))
    # Use the decoded n_estimators value, not the gene (an index into the
    # grid); fall back to the first gene when the search space has no such key.
    obj2 = int(indiv_params.get('n_estimators', individual[0]))
    obj3 = float(np.quantile(est.predict_proba(X_test).prod(axis=1), 0.5))
    return (obj1, obj2, obj3)
    
# Scikit Estimator
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.metrics import accuracy_score

# GA selection, crossover and mutation
tb.register("evaluate",
            eval_indiv,
            padded_params=padded_params,
            est=rf,
            X_train=X_train_transf,
            X_test=X_test_transf, 
            y_train=y_train, 
            y_test=y_test, 
            score=accuracy_score)
tb.register("select", tools.selTournament, tournsize=2)
tb.register("mate", tools.cxUniform, indpb=0.5)
tb.register("mutate", tools.mutShuffleIndexes, indpb=0.1)

#_eval(tb.individual(), padded_params, rf, X_train_transf, X_test_transf, y_train, y_test, accuracy_score)

def main(tb=tb, tools=tools, ga_algo=ga_algo, n_pop=50, n_gen=50):
    """Run ``eaSimple`` on ``tb`` and return ``(population, logbook, hall_of_fame)``.

    The per-generation average is reported separately for the three
    objectives under the names ``acc``, ``n_est`` and ``risk``.

    Parameters
    ----------
    tb : DEAP toolbox with individual/population/evaluate/select/mate/mutate registered.
    tools, ga_algo : the DEAP ``tools`` and ``algorithms`` modules.
    n_pop : population size.
    n_gen : number of generations.
    """
    from multiprocessing import Pool

    pop = tb.population(n=n_pop)
    hof = tools.HallOfFame(2)
    
    # Stats stdout
    stats1 = tools.Statistics(lambda ind: ind.fitness.values[0] )
    stats2 = tools.Statistics(lambda ind: ind.fitness.values[1] )
    stats3 = tools.Statistics(lambda ind: ind.fitness.values[2] )
    stats = tools.MultiStatistics(acc=stats1, n_est=stats2, risk=stats3)
    stats.register("avg", np.mean)
    
    # GA Run
    # The pool is created here, after every function of the cell has been
    # defined: a pool forked earlier would look up `eval_indiv` in a worker
    # that only knows whatever existed at fork time (possibly a stale version
    # from a previous cell). The `with` block releases the workers.
    with Pool() as pool:
        tb.register("map", pool.map)
        try:
            pop, log = ga_algo.eaSimple(pop, tb, cxpb=0.5, mutpb=0.1, ngen=n_gen, stats=stats, halloffame=hof, verbose=True)
        finally:
            # never leave a dead pool registered on the toolbox
            tb.register("map", map)
    
    return pop, log, hof
    
pop, log, hof = main()

f'Best Params {hof[0]}'
f'Best Params 2nd {hof[1]}'