In [512]:
# Se importa el set de datos y librerías necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# datos de pasos con 34 atributos a dataframe
datasetFull = pd.read_csv('dataset_steps.csv')  # carga CSV a data frame 

## A class is defined for the cromosomes (individuals with certain features)

In [513]:
class Cromosoma:
    '''
    Clase objeto cromosoma, con las propiedades de: cromosoma,
    fitness, número de atributos, dataset del cromosoma.
    Método: fit() para evaluar y darle valor al fitness del cromosoma
    '''
    # Inicialización de atributos
    def __init__(self, cromosoma, dataset):
        self.cromosoma = cromosoma
        self.n_atributos = 0
        for bit in self.cromosoma:
            if bit == 1:
                self.n_atributos+=1
        self.fitness = 0
        
        # Crear dataset (self.data) con solo los atributos del cromosoma
        counter = 0
        atributos = []
        for bit in self.cromosoma:
            counter+=1
            if bit == 0:
                string = 'Atributo_{}'.format(counter) 
                atributos.append(string)
        self.data = dataset.drop(columns=atributos)
            
    # Calcula fitness del cromosoma utilizando algoritmo de clasificación
    def fit(self):         
        
        # separar atributos en un data frame y etiquetas en otro
        X = self.data.drop(columns='class')
        Y = self.data['class']
        
        # Crear partición de entrenamiento y prueba de 80/20
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=69)
        
        # Clasificar con RandomForest
        clf = RandomForestClassifier(n_estimators=10, max_features=None, bootstrap=False, 
                             random_state=69) # 10 árboles en el bosque, usando todos los atributos y todos los datos
        
        clf.fit(X_train, Y_train)  # construye algoritmo con datos de entrenamiento
        self.fitness = clf.score(X_test, Y_test) # Accuracy promedio en datos de test y etiquetas
        
    # Returns the array of features of the child of two individuals 
    def breed(self, mate_cromosome, mutation_rate):
        crossover_point = random.randint(0, len(self.cromosoma)-1)
        child = self.cromosoma[0:crossover_point] + mate_cromosome[crossover_point:]
            
        # mutations according to mutation rate
        mutations = int(mutation_rate*len(self.cromosoma)) # número de mutaciones según tasa
        for i in range(mutations):
                        gene = random.randint(0, len(self.cromosoma)-1)
                        child[gene] = int(not(child[gene])) # flip bit at gene position
        return child

#### Accuracy of base case: using all atributes

In [514]:
cromosome = [1]*(len(datasetFull.columns)-1)
foo = Cromosoma(cromosome, datasetFull)
foo.fit()
print("Accuracy using all features: {}".format(foo.fitness))

Accuracy using all features: 0.8677685950413223


### A function is created for the parent selection criteria (proportionate to fitness), using roulette wheel selection

In [515]:
# Function for selecting the parents, using "Roulette wheel selection"
def parent_select(population):
    fitness_tot = 0
    for individual in population:
        fitness_tot+= individual.fitness  # Total fitness
    
    previous_prob = 0
    threshold = []  # threshold to select parent
    for i in range(len(population)):
        prob = previous_prob + population[i].fitness/fitness_tot
        threshold.append(prob)  # The higher the prob, the better the threshold
        previous_prob = prob
        
    randno = random.random()
    index = 0
    # If random number is less than threshold, choose that parent
    while randno > threshold[index]:
        index+=1
    return population[index]

### Let's generating an initial population of _n_ cromosomes with random features

In [516]:
import random, math

n = 10 # individuos por generación
mutation_rate = 0.15  # mutation rate (from 0 to 1)
len_set = len(datasetFull.columns)-1 # total number of features in the dataset

poblacion = []
for i in range(n):
    cromo = []
    for j in range(0, len_set):
        cromo.append(random.randint(0, 1))
    poblacion.append(Cromosoma(cromo, datasetFull))

promedio = 0
for cromo in poblacion:
    #print("First gen individual: ", cromo.cromosoma)
    cromo.fit()
    promedio+=cromo.fitness
    
print("Average fitness of starting pop: {} ".format(promedio/len(poblacion)))

Average fitness of starting pop: 0.7834710743801654 


### Genetic Algorithm Implementation

In [517]:
mejor_fitness = 0
generacion = 0
current_gen = poblacion

# Stop conditions: Fitness better than 0.93 or 100 generations 
while (mejor_fitness<=0.93 and generacion<100): 
    generacion+=1
    for cromosoma in current_gen:
        cromosoma.fit() # Realizar clasificación por medio de RandomForest
    current_gen = sorted(current_gen, reverse=True, key=lambda x:x.fitness)  # Reordenar en orden de mejor fitness
    mejor_fitness=current_gen[0].fitness
    
    # Hijos para generar 90% de siguiente generación
    next_gen = []
    for i in range(int(0.9*n)):
        # Parents are chosen using the parent_select function
        parent1 = parent_select(current_gen)
        parent2 = parent_select(current_gen)
        child = parent1.breed(parent2.cromosoma, mutation_rate)
        next_gen.append(Cromosoma(child, datasetFull))
    
        #padre1 = random.choice(current_gen[:math.ceil(0.5*n)])  
        #padre2 = random.choice(current_gen[:math.ceil(0.5*n)])
        #hijo = padre1.breed(padre2.cromosoma,mutation_rate)
        #next_gen.append(Cromosoma(hijo, datasetFull))
        
    # Elitism to generate 10% of next population
    for i in range(math.ceil(0.1*n)):
        next_gen.append(current_gen[i])
    
    avg = 0
    for individual in current_gen:
        avg+=individual.fitness
    avg = avg/len(current_gen)
    print("Generación: {}".format(generacion))
    print("Best fitness: {} No. features: {}".format(current_gen[0].fitness,
                                                                current_gen[0].n_atributos))
    current_gen = next_gen

# Reordenar conjunto final
for cromosoma in current_gen:
    cromosoma.fit()
current_gen = sorted(current_gen, reverse=True, key=lambda x:x.fitness)  



Generación: 1
Best fitness: 0.8760330578512396 No. features: 19
Generación: 2
Best fitness: 0.8760330578512396 No. features: 19
Generación: 3
Best fitness: 0.8760330578512396 No. features: 16
Generación: 4
Best fitness: 0.8760330578512396 No. features: 16
Generación: 5
Best fitness: 0.8760330578512396 No. features: 16
Generación: 6
Best fitness: 0.8925619834710744 No. features: 14
Generación: 7
Best fitness: 0.8925619834710744 No. features: 14
Generación: 8
Best fitness: 0.9008264462809917 No. features: 18
Generación: 9
Best fitness: 0.9008264462809917 No. features: 18
Generación: 10
Best fitness: 0.9008264462809917 No. features: 18
Generación: 11
Best fitness: 0.9008264462809917 No. features: 18
Generación: 12
Best fitness: 0.9008264462809917 No. features: 18
Generación: 13
Best fitness: 0.9008264462809917 No. features: 18
Generación: 14
Best fitness: 0.9008264462809917 No. features: 18
Generación: 15
Best fitness: 0.9008264462809917 No. features: 18
Generación: 16
Best fitness: 0.900

### Print the final best individuals

In [519]:
for individuo in current_gen:
    individuo.fit()
    x = [str(bit) for bit in individuo.cromosoma]
    cromo = "".join(x)
    print("{}: Accuracy: {}".format(cromo, individuo.fitness))
    
    print("Número de atributos: {}".format(individuo.n_atributos))

current_gen[0].data.to_csv('best.csv', index=False)   
current_gen[0].data.head(5)

1101110000110101100010010000100011: Accuracy: 0.9173553719008265
Número de atributos: 15
1001100110110101100010010000100011: Accuracy: 0.8925619834710744
Número de atributos: 15
1001100000111100000111000101111000: Accuracy: 0.8842975206611571
Número de atributos: 15
1001101011100001100010101100010111: Accuracy: 0.8842975206611571
Número de atributos: 17
1101110001110101100011010000010111: Accuracy: 0.8760330578512396
Número de atributos: 18
0111110011001101010110100100011010: Accuracy: 0.8429752066115702
Número de atributos: 18
0001110000011100100111000100011011: Accuracy: 0.8347107438016529
Número de atributos: 15
0101100110110110010001000000011010: Accuracy: 0.8347107438016529
Número de atributos: 14
1101100110011110010000100100011011: Accuracy: 0.8181818181818182
Número de atributos: 17
1000101010010100011000100101010001: Accuracy: 0.768595041322314
Número de atributos: 13


Unnamed: 0,Atributo_1,Atributo_2,Atributo_4,Atributo_5,Atributo_6,Atributo_11,Atributo_12,Atributo_14,Atributo_16,Atributo_17,Atributo_21,Atributo_24,Atributo_29,Atributo_33,Atributo_34,class
0,3.43,2.68,2.4,2.65,5.12,0.08,4.88,1.99,1.03,0.82,1.47,3.61,3.62,3.43,2.74,clase1
1,4.38,3.66,2.35,2.77,3.69,1.33,4.76,2.7,1.35,1.46,0.99,3.76,3.95,3.22,2.5,clase1
2,4.48,3.55,1.79,2.6,3.24,2.85,4.55,3.12,1.8,1.38,1.97,3.07,3.29,3.13,3.45,clase1
3,3.74,4.06,1.53,2.23,3.37,2.29,4.44,3.69,2.39,0.3,2.39,2.35,3.86,3.09,4.62,clase1
4,4.42,3.01,2.37,2.85,3.33,2.3,5.67,2.46,1.5,1.22,1.22,3.54,3.44,3.48,2.87,clase1


In [520]:
poblacion = sorted(poblacion, reverse=True, key=lambda x:x.fitness)  # Reordenar en orden de mejor fitness
print("Starting top fitness: {}, Starting avg fitness: {}"
      .format(poblacion[0].fitness, promedio/len(poblacion)))
print("Fitness using all features: {}".format(foo.fitness))
print("Final top fitness: {} Final avg fitness: {}".format(current_gen[0].fitness, avg))

Starting top fitness: 0.8760330578512396, Starting avg fitness: 0.7834710743801654
Fitness using all features: 0.8677685950413223
Final top fitness: 0.9173553719008265 Final avg fitness: 0.815702479338843
