<a href="https://colab.research.google.com/github/lrssv/ComputacaoEvolutiva/blob/master/Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model for classifying the victim's gender using an evolutionary algorithm to select features.

* ## Main

## Imports

In [3]:
import pandas as pd
import numpy as np
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from pandas import read_csv 

## Scripts

### Original Dataset

In [10]:
def database(url):
  df_encoded = pd.read_csv(url)
  df_encoded = df_encoded.drop(columns=['Unnamed: 0'])

  print("--------------------------------- Dataset ---------------------------------")
  print(pd.DataFrame(df_encoded))

  return df_encoded

### Initial Population

In [24]:
def population(ind_size,pop_size,models):
  ind = []
  pop = []

  for p in range(pop_size):
    ind = [random.choice([True,False]) for i in range(ind_size)]
    ind[0] = random.choice(range(0,len(models)+1))
    pop.append(ind)
    ind = []

  print("-------------------------- Initial Population --------------------------")
  print(pd.DataFrame(pop, columns=['model', 'feature 1', 'feature 2', 'feature 3','feature 4',
                                   'feature 5', 'feature 6', 'feature 7','feature 8', 'feature 9',
                                   'feature 10', 'feature 11','feature 12', 'feature 13', 'feature 14',
                                   'feature 15','feature 16', 'feature 17']))
  return pop

### Fitness

In [30]:
def fitness(population,df,models):
  result = []

  x = df[df.columns.difference(['Victim Sex'])]
  y = df[['Victim Sex']]
  
  x_train, x_test, y_train, y_test = np.array(train_test_split(x, y, test_size=0.2, stratify=y, random_state=42))

  x_train = np.array(x_train)
  x_test = np.array(x_test)
  y_train = np.ravel(y_train)
  y_test = np.array(y_test)

  for p in population:
    model = models[p[1]]
    ind = [i==1 for i in p[1:]] 
    model.fit(x_train[:,ind],y_train)
    y_pred = model.predict(x_test[:,ind])
    accuracy = accuracy_score(y_test,y_pred)

    result.append([p,accuracy])
  
  print("----------------------- Initial Population -----------------------")
  print(pd.DataFrame(result, columns=['individual', 'fitness']))
  return result

### Tournament Selection

In [None]:
def tour(fitness):
  indiv_current = 1
  new_pop = []

  indiv = [i[0] for i in fitness]
  fit = [i[1] for i in fitness]

  while indiv_current <= 50:
    x = random.choice(range(len(fitness)))
    y = random.choice(range(len(fitness)))
    
    if fit[x] > fit[y]:
      new_pop.append(indiv[x])
    else:
      new_pop.append(indiv[y])
    
    indiv_current += 1

  return new_pop

### Mutation and Crossover

In [None]:
def mutation_and_crossover(population):
  children = []
  prop_mutation = 0.1 #prob of mutation
  rand = random.uniform(0,1) #random value to compare with prob_mutation


  for p in population:
    parent1 = random.choice(population) 
    parent2 = random.choice(population)
    n = random.choice(range(1,len(parent1))) #gene that gona be mutaded
    
    if rand < prop_mutation:
      #mutation
      parent1[0] = random.choice(range(4)) #mute the model
      
      if parent1[n] == True:
        parent1[n] = False 
        child = parent1
      else:
        parent1[n] = True 
        child = parent1
      
      children.append(child)
    
    else:
      #crossover
      chromosome_length = len(parent1) #length of chromosome
      crossover_point = random.randint(1,chromosome_length-1) #point of crossover
      child1 = parent1[1:crossover_point] + parent2[crossover_point:]
      child2 = parent2[1:crossover_point] + parent1[crossover_point:]

      child1.insert(0,parent2[0]) #model+child1
      child2.insert(0,parent1[0]) #model+child2
  
      children.append([child1,child2])
  
  return children      

##Setups

In [14]:
# All models options
models = [DecisionTreeClassifier(max_depth=10),DecisionTreeClassifier(max_depth=3),DecisionTreeClassifier(max_depth=14),
          RandomForestClassifier(max_depth=10)]

### Initial Conditions 

In [33]:
# Original Data
df_encoded = database('/content/drive/My Drive/Data Files/df_model_victim_encoded')

# Initial population with 100 individuals
indiv_size = 18
population_size = 100
initial_population = population(indiv_size,population_size,models)

--------------------------------- Dataset ---------------------------------
            City  Year     Month  ...    Weapon  Victim Count  Perpetrator Count
0       1.279586  1980  1.226100  ...  1.370346             0                  0
1       1.279586  1980  1.230114  ...  1.683970             0                  0
2       1.279586  1980  1.230114  ...  1.365691             0                  0
3       1.279586  1980  1.230194  ...  1.683970             0                  0
4       1.279586  1980  1.230194  ...  1.365691             0                  1
...          ...   ...       ...  ...       ...           ...                ...
638449  1.403846  2014  1.226100  ...  1.153664             0                  0
638450  1.393939  2014  1.219243  ...  1.153664             0                  0
638451  1.393939  2014  1.220578  ...  1.573841             0                  0
638452  1.277778  2014  1.223096  ...  1.267381             0                  1
638453  1.355263  2014  1.220578 

### Loop

In [31]:
#Fitness 
initial_fitness = fitness(initial_population,df_encoded,models)

----------------------- Initial Population -----------------------
                                           individual   fitness
0   [3, True, False, False, True, False, False, Fa...  0.780454
1   [1, True, True, False, True, False, False, Tru...  0.859168
2   [2, False, False, True, True, True, False, Tru...  0.773437
3   [1, True, False, True, True, False, False, Fal...  0.859168
4   [0, False, True, False, True, False, True, Fal...  0.864744
..                                                ...       ...
95  [0, True, True, True, True, False, False, Fals...  0.859168
96  [2, False, True, False, True, True, True, True...  0.858580
97  [4, True, True, False, True, False, True, Fals...  0.859168
98  [1, False, True, False, False, True, False, Fa...  0.779139
99  [3, True, True, True, False, True, False, Fals...  0.773986

[100 rows x 2 columns]


In [None]:
run = mutation_and_crossover(population)