<a href="https://colab.research.google.com/github/lrssv/ComputacaoEvolutiva/blob/master/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model for classifying the Perpetrator's gender using an evolutionary algorithm to select features.

* ## Main

## Imports

In [31]:
import pandas as pd
import numpy as np
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pandas import read_csv 
from sklearn.metrics import mean_squared_error
from math import sqrt
import math
import pickle

## Scripts

### Original Dataset

In [2]:
def database(url):
  """Load the encoded dataset from a CSV source.

  Args:
    url: anything `pd.read_csv` accepts (path, URL or file-like object).

  Returns:
    The parsed DataFrame without the 'Unnamed: 0' column. `drop` raises
    KeyError when that column is not present in the file.
  """
  return pd.read_csv(url).drop(columns=['Unnamed: 0'])

### Train and Test

In [3]:
def train_and_test(df):
  """Split the dataset into stratified train/test arrays.

  The target is the 'Victim Sex' column; every other column is a feature.
  NOTE(review): the notebook title talks about the perpetrator's gender
  while the target used here is 'Victim Sex' - confirm this is intended.

  Args:
    df: DataFrame containing a 'Victim Sex' column plus the feature columns.

  Returns:
    Tuple (x_train, x_test, y_train, y_test) of NumPy arrays. y_train is
    flattened to 1-D; y_test keeps the single-column 2-D shape of the
    one-column frame it is built from.
  """
  # Index.difference returns the remaining column labels (sorted by default).
  x = df[df.columns.difference(['Victim Sex'])]
  y = df[['Victim Sex']]

  # train_test_split already returns the four splits. Wrapping the result in
  # np.array() tried to pack four differently shaped frames into one array,
  # which raises ValueError on NumPy >= 1.24 (ragged array creation).
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

  x_train = np.array(x_train)
  x_test = np.array(x_test)
  y_train = np.ravel(y_train)
  y_test = np.array(y_test)

  return (x_train,x_test,y_train,y_test)

### Initial Population

In [4]:
def population(ind_size,pop_size,models):
  """Create the random initial population.

  Each individual is a list of `ind_size` genes: gene 0 is an index into
  `models`, the remaining genes are random booleans (feature on/off flags).

  Args:
    ind_size: number of genes per individual (model gene included).
    pop_size: number of individuals to create.
    models: sequence of candidate models; only its length is used here.

  Returns:
    List of `pop_size` individuals.
  """
  pop = []

  for _ in range(pop_size):
    ind = [random.choice([True,False]) for _ in range(ind_size)]
    # randrange(len(models)) covers every model. The previous
    # range(0, len(models)-1) could never pick the last one and failed
    # outright when only one model was supplied.
    ind[0] = random.randrange(len(models))
    pop.append(ind)

  return pop

### Fitness

In [5]:
def fitness(population,x_train,x_test,y_train,y_test,models):
  """Score every individual of a population (lower is better).

  For an individual `p`, gene 0 selects the model and genes 1.. form a
  boolean mask over the feature columns. The score is
  `20 * RMSE(test predictions) + number of selected features`; an
  individual that selects no feature gets `math.inf`.

  Args:
    population: list of individuals (see above).
    x_train, x_test: 2-D arrays indexed as `x[:, mask]`.
    y_train, y_test: targets for fitting and scoring.
    models: sequence of estimators exposing `fit` and `predict`. They are
      refit in place, so the same instances are reused across individuals.

  Returns:
    List of `[fit_value, individual]` pairs in population order.
  """
  result = []

  for p in population:
    ind = [gene == 1 for gene in p[1:]]
    if not any(ind):
      # Nothing to train on: make this individual lose every comparison.
      fit_value = math.inf
    else:
      # Gene 0 holds the model index. The previous lookup used p[1], the
      # first feature flag, so only models[0] / models[1] were ever used.
      model = models[p[0]]
      model.fit(x_train[:,ind],y_train)
      y_pred = model.predict(x_test[:,ind])
      rmse = sqrt(mean_squared_error(y_test, y_pred))
      fit_value = 20*rmse + np.sum(p[1:])

    result.append([fit_value,p])

  return result

### Tournament Selection

In [6]:
def tour(fitness,tours):
  """Binary tournament selection.

  Runs `tours` rounds. Each round draws two entries from `fitness` (with
  replacement) and keeps the one whose first element is strictly smaller;
  on a tie the second draw is kept.

  Args:
    fitness: list of `[fit_value, individual]` entries.
    tours: number of rounds, i.e. number of entries returned.

  Returns:
    List with the winner of each round.
  """
  winners = []

  for _ in range(tours):
    first = random.choice(fitness)
    second = random.choice(fitness)
    winners.append(first if first[0] < second[0] else second)

  return winners

### Mutation and Crossover

In [7]:
def mutation_and_crossover(gen, prob_mutation=0.1, n_models=4):
  """Produce children from a selected pool by mutation or one-point crossover.

  One step is performed per entry of `gen`. Each step draws two parents
  (with replacement) from the chromosomes in `gen`, then either
    * mutates a copy of the first parent (probability `prob_mutation`):
      the model gene is re-drawn and one feature flag is flipped, giving
      one child; or
    * crosses both parents at a random point, giving two children, each
      taking the model gene of the opposite parent.

  Args:
    gen: list of `[fit_value, chromosome]` entries; chromosomes need at
      least two genes (model gene + one feature flag).
    prob_mutation: probability of mutating instead of crossing over.
    n_models: number of models the model gene may index. The default keeps
      the value that used to be hard-coded; pass `len(models)` to make
      every model reachable.

  Returns:
    List of child chromosomes (without fitness values).
  """
  children = []

  pop = [entry[1] for entry in gen]

  for _ in gen:
    parent1 = random.choice(pop)
    parent2 = random.choice(pop)

    if random.uniform(0,1) < prob_mutation:
      # Mutate a copy: the parent list is shared with `gen`, so changing it
      # in place would silently alter the selected pool.
      child = list(parent1)
      child[0] = random.randrange(n_models) # re-draw the model gene

      # Start at 1 so the flip never turns the model gene into a boolean.
      n = random.randrange(1, len(child))
      child[n] = not child[n]

      children.append(child)

    else:
      # One-point crossover over the feature genes; slicing creates new lists.
      crossover_point = random.randint(1, len(parent1)-1)
      child1 = [parent2[0]] + parent1[1:crossover_point] + parent2[crossover_point:]
      child2 = [parent1[0]] + parent2[1:crossover_point] + parent1[crossover_point:]

      children.append(child1)
      children.append(child2)

  return children

### Survivors

In [8]:
def survivors(generation,population,n_children=95,n_parents=5):
  """Select the next generation by binary tournaments.

  `n_children` winners are drawn from `generation` followed by `n_parents`
  winners from `population`. Each tournament draws two entries with
  replacement and keeps the one with the strictly lower fitness (the
  second draw wins ties).

  Args:
    generation: list of `[fit_value, chromosome]` entries for the children.
    population: list of `[fit_value, chromosome]` entries for the parents.
    n_children: number of survivors taken from `generation`.
    n_parents: number of survivors taken from `population`.

  Returns:
    List of `n_children + n_parents` entries, children first.
  """
  surv = []

  for pool, quota in ((generation, n_children), (population, n_parents)):
    for _ in range(quota):
      first = random.choice(pool)
      second = random.choice(pool)

      # Compare fitness (index 0). Index 1 is the chromosome list, which
      # the previous code compared lexicographically by mistake.
      surv.append(first if first[0] < second[0] else second)

  return surv

### Evaluation

In [9]:
def evaluation(genetic_alg,models,x_train,x_test,y_train,y_test):
  """Refit the fittest individual and report its test accuracy.

  Args:
    genetic_alg: list of `[fit_value, individual]` entries.
    models: sequence of estimators; gene 0 of an individual indexes it.
    x_train, x_test: 2-D arrays indexed as `x[:, mask]`.
    y_train, y_test: targets for fitting and scoring.

  Returns:
    Tuple `(best_indiv, best_fit, accuracy)`. Previously nothing was
    returned, so assigning the call result always gave None.
  """
  # Lowest fitness wins; min() keeps the first entry on ties.
  best_entry = min(genetic_alg, key=lambda entry: entry[0])
  best_fit = best_entry[0]
  best_indiv = best_entry[1]

  model = models[best_indiv[0]]
  ind = [gene == 1 for gene in best_indiv[1:]]
  model.fit(x_train[:,ind],y_train)
  y_pred = model.predict(x_test[:,ind])
  accuracy = accuracy_score(y_test,y_pred)

  print('--------------- RESULTADOS --------------- ')
  print('Melhor individuo: {} \nFitness: {} \nAcurácia: {}'.format(best_indiv,best_fit,accuracy))

  return best_indiv, best_fit, accuracy

### Best individual in each population

In [20]:
def best(population):
  """Return (and print) the entry with the lowest fitness.

  Args:
    population: list of `[fit_value, individual]` entries.

  Returns:
    The first entry whose `fit_value` is minimal.
  """
  best_indiv = min(population, key=lambda entry: entry[0])

  print("Melhor individuo: {}".format(best_indiv))
  return best_indiv

## Setups

### Initial Conditions 

In [11]:
# Initial population with 100 individuals
indiv_size = 18        # genes per individual: gene 0 = model index, the rest = feature flags
population_size = 100

# Candidate classifiers; an individual's gene 0 indexes into this list.
models = [DecisionTreeClassifier(max_depth=depth) for depth in (10, 3, 14)]
models += [RandomForestClassifier(max_depth=depth) for depth in (10, 14)]

In [12]:
# Original Data
# NOTE(review): absolute Google Drive path - assumes Drive is mounted at
# /content/drive (no mount cell is visible in this notebook); confirm.
df = database('/content/drive/My Drive/Data Files/df_model_per_encoded')

In [13]:
#Train and Test
# Stratified 80/20 split with a fixed random_state; see train_and_test().
x_train,x_test,y_train,y_test = train_and_test(df)

### Loop

In [17]:
def loop(int,tours,x_train,x_test,y_train,y_test,models,indiv_size,population_size):
  """Run the genetic algorithm.

  Args:
    int: number of generations to evolve. The name shadows the builtin
      inside this function; it is kept for compatibility with callers.
    tours: number of tournament rounds used for parent selection.
    x_train, x_test, y_train, y_test: data passed through to `fitness`.
    models: candidate estimators indexed by gene 0 of an individual.
    indiv_size: number of genes per individual.
    population_size: number of individuals in the initial population.

  Returns:
    Tuple `(survs, bests)`: the scored final population and the best
    `[fit_value, individual]` entry of the initial population followed by
    the best of each generation.
  """
  bests = []

  #Step 1: random initial population
  first_population = population(indiv_size,population_size,models)

  #Step 2: score it and record its best individual
  gen_fit = fitness(first_population,x_train,x_test,y_train,y_test,models)
  bests.append(best(gen_fit))

  # With zero generations the scored initial population is the result;
  # without this the return below failed on an unbound name.
  survs = gen_fit

  for _ in range(int):
    #Step 3: parent selection
    tourneament = tour(gen_fit,tours)

    #Step 4: variation
    children = mutation_and_crossover(tourneament)

    #Step 5: score the children
    children_fitness = fitness(children,x_train,x_test,y_train,y_test,models)

    #Step 6: survivor selection among children and selected parents
    survs = survivors(children_fitness,tourneament)

    #Step 7: best individual of this generation
    bests.append(best(survs))

    gen_fit = survs

  return survs,bests

## Result of the Genetic Algorithm

In [15]:
loops = 100  # number of generations passed to loop()
tours = 200  # tournament rounds per generation, i.e. size of the selected parent pool

In [None]:
# Run the genetic algorithm; the result is the (final survivors, best per generation) tuple.
genetic_alg = loop(loops,tours,x_train,x_test,y_train,y_test,models,indiv_size,population_size)

In [25]:
# Unpack the two values returned by loop().
survives,bests_ind = genetic_alg 

In [26]:
# One row per recorded best entry: column 0 is the fitness, column 1 the individual.
pd.DataFrame(bests_ind)

Unnamed: 0,0,1
0,12.447710,"[3, False, False, False, False, False, False, ..."
1,12.459891,"[3, False, False, False, False, False, False, ..."
2,12.459681,"[0, False, False, False, False, False, False, ..."
3,11.469123,"[0, False, False, False, True, False, False, F..."
4,10.510946,"[3, False, False, False, False, False, False, ..."
...,...,...
96,8.510529,"[3, False, False, False, False, False, False, ..."
97,8.510529,"[0, False, False, False, False, False, False, ..."
98,8.510529,"[3, False, False, False, False, False, False, ..."
99,8.510529,"[0, False, False, False, False, False, False, ..."


In [27]:
print('--------------- MELHORES INDIVIDUOS --------------- ')
# NOTE(review): the 'Acurácia' column holds the fitness value computed by
# fitness() (20*RMSE + number of selected features, lower is better), not an
# accuracy; 'Modelo' holds the whole individual (model gene + feature flags).
pd.DataFrame(survives,columns=['Acurácia','Modelo'])

--------------- MELHORES INDIVIDUOS --------------- 


Unnamed: 0,Acurácia,Modelo
0,8.510529,"[0, False, False, False, False, False, False, ..."
1,8.510529,"[0, False, False, False, False, False, False, ..."
2,8.510529,"[0, False, False, False, False, False, False, ..."
3,9.513031,"[0, False, False, False, False, False, False, ..."
4,8.510529,"[0, False, False, False, False, False, False, ..."
...,...,...
95,8.510529,"[0, False, False, False, False, False, False, ..."
96,8.510529,"[0, False, False, False, False, False, False, ..."
97,8.510529,"[0, False, False, False, False, False, False, ..."
98,8.510529,"[0, False, False, False, False, False, False, ..."


In [30]:
# Refit the fittest surviving individual and print its fitness and test accuracy.
results = evaluation(survives,models,x_train,x_test,y_train,y_test) 

--------------- RESULTADOS --------------- 
Melhor individuo: [0, False, False, False, False, False, False, False, False, False, False, True, False, False, False, False, False, False] 
Fitness: 8.510529112672035 
Acurácia: 0.8591678348513208


In [32]:
#export population
# Pickles the (survivors, bests) tuple; the file content is binary despite the .txt name.
with open("/content/drive/My Drive/Data Files/results.txt", "wb") as fp: 
  pickle.dump(genetic_alg, fp)