# Evolving Ensembles in Multi-objective GeneticProgramming for Classification with Unbalanced Data

### Dataset YEAST

Minority class = 'MIT'

Majority class = other tags

In [1]:
# Python 3.8.3
import deap # 1.3.1
from deap import gp, creator, base, tools, algorithms

import numpy as np # 1.18.5

import operator, random

import matplotlib.pyplot as plt

### Parameters

In [2]:
PNUM = 20 # Number of individuals in a population
GNUM = 10 # Number of generations
MTC = 0.2 # Mutation chance
CSC = 0.5 # Crossover chance

#### Helper functions for primitive operators

In [3]:

# Protected division
def div(x, y):
    return 1 if (y == 0) else x/y

# If then operator - changed the name from 'if_then' to 'lf' because of cxSemantic method
def lf(a, b, c):
    return b if (a < 0) else c


#### Importing dataset

In [4]:
filename = "./Datasets/yeast.data"

with open(filename) as f:
    content = f.read().splitlines()
    
data = [] # data for saving the features from file
names = [] # names of the protein classes
    
for line in content:
    tmp = line.split()
    tmp.pop(0) # pops the first element - we dont need it
    
    names.append(tmp.pop()) # pops the last element - name of protein
    
    data.append(np.asarray(tmp, float)) # extracting features to data as float values

#### Creating a primitive set for the classificator expression

In [5]:
pset = gp.PrimitiveSet("main", arity=8)

# Adding primitive operators
pset.addPrimitive(operator.add, 2)
pset.addPrimitive(operator.sub, 2)
pset.addPrimitive(operator.mul, 2)
pset.addPrimitive(div, 2) 
pset.addPrimitive(lf, 3, name="lf") # Added 'lf' as a name for the function


# Generate an expression where each leaf has the same depth between min and max.
expr = gp.genHalfAndHalf(pset, min_=1, max_=8) # Create expression of max depth 8 (from paper)
tree = gp.PrimitiveTree(expr) # Create tree classificator from expression

### Creating a fitness function and an individual (Generation of Tree Individuals)

In [6]:
creator.create("Fitness", base.Fitness, weights=(1.0, 1.0)) #"Fitness ili FitnessMin"

# Create an individual of type primitive tree and our fitness function, pset is a previously defined
creator.create("Individual", gp.PrimitiveTree, fitness=creator.Fitness, pset=pset)

### Helper function for evaluation

In [7]:
# Evaluation function
def evaluateAccuracy(individual, data, names, pset): # po predlosku
    
    #Compiling the expression
    classificator = gp.compile(individual, pset)
    
    minority_count = names.count("MIT")
    true_positive_count = 0
    
    majority_count = len(names) - minority_count
    true_negative_count = 0
    
    for (d, n) in zip(data, names):
        # feeding data to individual classificator
        result = classificator(*d)        
        #counting right answers
        if (n=="MIT") and (result >= 0): true_positive_count += 1
        elif (n!="MIT") and (result < 0): true_negative_count += 1
        
    # return accuracy on minority class and majority class
    return (true_positive_count/minority_count, true_negative_count/majority_count)


### Registering elements with toolbox

In [8]:
# Creating toolbox
toolbox = base.Toolbox()

toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=8) # genHalfAndHalf - Generate an expression with a PrimitiveSet pset. Half the time, the expression is generated with genGrow(), the other half, the expression is generated with genFull()
toolbox.register("individual", tools.initIterate, creator.Individual, toolbox.expr) # Register individual from expression

toolbox.register("population", tools.initRepeat, list, toolbox.individual)
#toolbox.register("population", tools.initRepeat, gp.PrimitiveTree, toolbox.individual)

toolbox.register('select', tools.selTournament, tournsize = 7)
# toolbox.register('select', tools.selSPEA2)

# Register mate function
toolbox.register('mate', gp.cxSemantic, pset=pset, min=1, max=8)

#Register mutate function
toolbox.register('mutate', gp.mutSemantic, pset=pset, min=1, max=8)

toolbox.register('evaluate', evaluateAccuracy, data=data, names=names, pset=pset)

#expr = toolbox.individual()
#nodes, edges, labels = gp.graph(expr)

### Graphviz Section ###
#import pygraphviz as pgv

#g = pgv.AGraph()
#g.add_nodes_from(nodes)
#g.add_edges_from(edges)
#g.layout(prog="dot")

#for i in nodes:
#    n = g.get_node(i)
#    n.attr["label"] = labels[i]

#g.draw("tree.pdf")

### Generating population

In [9]:
# toolbox.population(n=PNUM)

population = toolbox.population(n=PNUM)

### Evolving the population / geting hallOfFame

In [10]:
hof = tools.HallOfFame(10) #Hall of fame

stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

algorithms.eaSimple(population, toolbox, cxpb=CSC, mutpb=MTC, ngen=GNUM, stats=stats, halloffame=hof)
# log = algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=7, stats=stats, halloffame=hof)

gen	nevals	avg     	std     	min	max
0  	20    	0.487767	0.438764	0  	1  
1  	11    	0.499012	0.473903	0  	1  
2  	13    	0.502522	0.485885	0  	1  
3  	16    	0.502489	0.486169	0  	1  
4  	12    	0.503432	0.486416	0  	1  
5  	12    	0.501306	0.498085	0  	1  
6  	13    	0.50129 	0.498711	0  	1  
7  	14    	0.501532	0.498468	0  	1  
8  	14    	0.501003	0.498387	0  	1  
9  	12    	0.501376	0.498025	0  	1  
10 	14    	0.501356	0.498165	0  	1  


([[<deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072770>,
   <deap.gp.Primitive at 0x7fb508072220>,
   <deap.gp.Primitive at 0x7fb508072cc0>,
   <deap.gp.Primitive at 0x7fb508072cc0>,
   <deap.gp.Terminal at 0x7fb508075a80>,
   <deap.gp.Terminal at 0x7fb50806ca40>,
   <deap.gp.Primitive at 0x7fb508072d10>,
   <deap.gp.Terminal at 0x7fb50408da00>,
   <deap.gp.Terminal at 0x7fb508075a80>,
   <deap.gp.Terminal at 0x7fb50806ca40>,
   <deap.gp.Primitive at 0x7fb508072d10>,
   <deap.gp.Primitive at 0x7fb508072770

#### print hof

In [11]:
#printaj hall of fame jedinke
i = 1;
for h in hof: #zaš su sve iste, jer nemamo crowding?
    print(i, h)
    print(h.fitness.values)
    print("\n")
    i += 1

#print(log)

1 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


2 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


3 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


4 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


5 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


6 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


7 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


8 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0, sub(ARG0, mul(ARG5, mul(ARG5, ARG7)))))), ARG5)
(1.0, 0.0032258064516129032)


9 add(sub(ARG4, mul(ARG2, lf(ARG4, ARG0,