In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
>>> cd/content/drive/My Drive/LAB/dataset

/content/drive/My Drive/LAB/dataset


In [1]:
import pandas as pd
import random

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
dataset = pd.read_csv("diabetes.csv")

In [5]:
dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [13]:

# features
data = dataset[['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']]

# target
target = dataset[['Outcome']]

In [7]:
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(data, target, train_size = 0.75)

In [18]:
GENE_LENGTH = 8             # total number of features
MUTATION_RATE = 0.02         # 2% mutation rate

# mapping each feature to a integer in a dictionary
feature_dictionary = {
    0 : 'Pregnancies',
    1 : 'Glucose',
    2 : 'BloodPressure',
    3 : 'SkinThickness',
    4 : 'Insulin',
    5 : 'BMI',
    6 : 'DiabetesPedigreeFunction',
    7 : 'Age'
}

In [25]:
class Genome:
    def __init__(self, gene = None, generation = 0):
        self.gene = self.validate_gene(gene)
        self.generation = generation

    # check if the given gene is a valid gene
    def validate_gene(self, gene):
        if gene == None:
            return [random.randint(0, 1) for _ in range(GENE_LENGTH)]
        else:
            return gene

    # dump the gene info : "Gene / Generation"
    def dump(self):
        print(f"Gene: [{' '.join(str(i) for i in self.gene)}] / Generation: {self.generation}")

    # generate offspring by breeding two genomes
    def breed(self, other, mutation = False, randomized = False):
        baby_gene = [0 for _ in range(GENE_LENGTH)]

        # alternate between the parents to acquire genes
        if not randomized:
            for i in range(GENE_LENGTH):
                if i % 2 == 0:
                    baby_gene[i] = self.gene[i]
                else:
                    baby_gene[i] = other.gene[i]

        # randomly select parents for each allele
        else:
            for i in range(GENE_LENGTH):
                acquired_gene_chance = random.randint(0, 1)

                if acquired_gene_chance == 0:
                    baby_gene[i] = self.gene[i]

                else:
                    baby_gene[i] = other.gene[i]

        child = Genome(baby_gene, self.generation + 1)
        if mutation: child.mutate()

        return child

    def mutate(self):
        gene_copy = self.gene[:]
        for allele_index in range(GENE_LENGTH):
            mutation_chance = random.uniform(0, 1)

            if mutation_chance <= MUTATION_RATE:
                gene_copy[allele_index] = 0 if gene_copy[allele_index] == 1 else 1

        self.gene = gene_copy[:]


In [26]:
class Population:
    def __init__(self, pool_size, model_type, fitness_function):
        self.pool_size = pool_size
        self.model_type = model_type
        self.fitness_function = fitness_function
        self.pool = []

    # generate initial random population
    def generate_population(self):
        for _ in range(self.pool_size):
            self.pool.append(Genome())

    # find the fitness of each genome in the population
    # and select the best performing genomes for producing the
    # next generation of genomes
    def natural_selection(self):
        fitness_pool = []

        for (index, genome) in enumerate(self.pool):
            genome_model = self.model_type()

            # genome specific feature set extraction
            genome_train_data = xtrain[[feature_dictionary[index] for (index, allele) in enumerate(genome.gene) if allele == 1]]
            genome_test_data = xtest[[feature_dictionary[index] for (index, allele) in enumerate(genome.gene) if allele == 1]]
            
            # fit the genome model
            genome_model.fit(genome_train_data, ytrain)

            # find the fitness of the model using a fitness function
            genome_prediction = genome_model.predict(genome_test_data)
            genome_fitness = self.fitness_function(genome_prediction, ytest)

            fitness_pool.append((index, genome_fitness))

        # select the best performers
        fitness_pool.sort(key = lambda x: x[1])
        best_fitness = fitness_pool[0][1]
        fitness_pool = fitness_pool[10:]

        # fill the mating pool in a such a way that best performers are more 
        # likely to reproduce more, hence resulting in better genome set
        # for the next generation
        mating_pool = []
        for index, fitness in fitness_pool:
            for _ in range(round(fitness * 100)):
                mating_pool.append(self.pool[index])

        print(len(mating_pool))

        return mating_pool, best_fitness

    # simulate "total_generations: int" number of generations
    # and solve for the best feature set.
    def evolve(self, total_generations):

        for _ in range(total_generations):
            mating_pool, best_fitness = self.natural_selection()
            print(f"Generation: {mating_pool[0].generation}")
            print(f"Best Genome: {mating_pool[0].gene}")
            print(f"Best fitness: {best_fitness}")
            print("=========================================")
            next_pool = []

            for _ in range(self.pool_size):
                father = random.choice(mating_pool)
                mother = random.choice(mating_pool)

                child = father.breed(mother)
                next_pool.append(child)

            self.pool = next_pool

        print(mating_pool[0].gene)


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score as accuracy

In [29]:
DTC_population = Population(
                pool_size = 20,
                model_type = DecisionTreeClassifier,
                fitness_function = accuracy
            )

DTC_population.generate_population()
DTC_population.evolve(10)

674
Generation: 0
Best Genome: [1, 0, 0, 0, 1, 1, 0, 1]
Best fitness: 0.5833333333333334
685
Generation: 1
Best Genome: [1, 1, 0, 1, 1, 0, 0, 0]
Best fitness: 0.6041666666666666
696
Generation: 2
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.6354166666666666
700
Generation: 3
Best Genome: [1, 1, 0, 1, 1, 0, 0, 0]
Best fitness: 0.65625
701
Generation: 4
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.6770833333333334
703
Generation: 5
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.6770833333333334
702
Generation: 6
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.671875
705
Generation: 7
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.6614583333333334
704
Generation: 8
Best Genome: [1, 1, 1, 0, 1, 0, 0, 1]
Best fitness: 0.671875
704
Generation: 9
Best Genome: [1, 1, 0, 0, 1, 0, 0, 1]
Best fitness: 0.6770833333333334
[1, 1, 0, 0, 1, 0, 0, 1]


In [31]:
from sklearn.linear_model import LogisticRegression

In [34]:
LR_population = Population(
                pool_size = 20,
                model_type = LogisticRegression,
                fitness_function = accuracy
            )

LR_population.generate_population()
LR_population.evolve(10)

734
Generation: 0
Best Genome: [1, 0, 1, 0, 1, 0, 1, 1]
Best fitness: 0.6041666666666666
773
Generation: 1
Best Genome: [0, 1, 1, 1, 0, 1, 0, 1]
Best fitness: 0.625
788
Generation: 2
Best Genome: [1, 1, 0, 1, 1, 0, 1, 0]
Best fitness: 0.75
790
Generation: 3
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7760416666666666
790
Generation: 4
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
790
Generation: 5
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
790
Generation: 6
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
790
Generation: 7
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
790
Generation: 8
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
790
Generation: 9
Best Genome: [1, 1, 0, 1, 1, 1, 1, 1]
Best fitness: 0.7864583333333334
[1, 1, 0, 1, 1, 1, 1, 1]


<h1>References</h1>

[My article on Genetic Algorithm](https://dev.to/kavinbharathi/genetic-algorithm-in-action-3ilj)