# Algoritmos genéticos para otimização de hiperparâmetros

Logistic Regression com as funções escritas passo a passo

In [1]:

import numpy as np
import pandas as pd
import random
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset, flag z-score outliers per column, drop the selected rows,
# then split and standardise the features.
df = pd.read_csv("diabetes.csv")
limit = 3  # |z-score| above which a value is treated as an outlier


def zscore_outliers(series, threshold):
    """Return the entries of ``series`` whose absolute z-score exceeds ``threshold``.

    Args:
        series: Numeric pandas Series (one DataFrame column).
        threshold: Cut-off applied to the absolute z-score.

    Returns:
        The flagged subset of ``series``, keeping its original index labels.
    """
    z_scores = stats.zscore(series)
    return series[abs(z_scores) > threshold]


outliers_p = zscore_outliers(df['Pregnancies'], limit)
outliers_g = zscore_outliers(df['Glucose'], limit)
outliers_b = zscore_outliers(df['BloodPressure'], limit)
outliers_bmi = zscore_outliers(df['BMI'], limit)
outliers_dpf = zscore_outliers(df['DiabetesPedigreeFunction'], limit)
outliers_a = zscore_outliers(df['Age'], limit)

# Only the Glucose and BloodPressure outliers feed the drop below; the other
# per-column results are computed but do not affect which rows are removed.
# NOTE(review): these row labels are hard-coded with no derivation in this
# notebook -- presumably found in an earlier exploration; confirm, and ideally
# compute them instead of listing them by hand.
manual_outlier_rows = [9, 49, 60, 81, 145, 371, 426, 494, 522, 684, 706]
unique_outliers = list(
    set(outliers_g.index) | set(outliers_b.index) | set(manual_outlier_rows)
)
# Reassign instead of mutating in place so the cleaning step reads as a plain
# transformation of the freshly loaded frame.
df = df.drop(unique_outliers)

X = df.drop("Outcome", axis=1)
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the scaler on the training split only, then apply those same statistics
# to the test split and to the full feature matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_scaled = scaler.transform(X)

In [3]:

def fitness_function(individual):
    """Score one hyper-parameter set by fitting a LogisticRegression.

    Args:
        individual: Sequence ``[C, penalty_idx, solver_idx, max_iter]`` where
            ``penalty_idx`` indexes ``['l1', 'l2', 'elasticnet', None]`` and
            ``solver_idx`` indexes ``['liblinear', 'lbfgs', 'saga']``.

    Returns:
        Recall of the fitted model on ``X_test_scaled`` / ``y_test``, or ``0``
        when ``C`` is not positive, the penalty/solver pair is rejected by the
        compatibility table below, or fitting/prediction raises.

    Raises:
        IndexError: If ``penalty_idx`` or ``solver_idx`` is out of range
            (the lookups happen before any other check).
    """
    C = individual[0]
    penalty = ['l1', 'l2', 'elasticnet', None][individual[1]]
    solver = ['liblinear', 'lbfgs', 'saga'][individual[2]]
    max_iter = individual[3]

    if C <= 0:
        return 0

    # Solvers accepted for each penalty; any other pairing scores 0 without
    # fitting a model.
    supported_solvers = {
        'l1': ('liblinear', 'saga'),
        'l2': ('liblinear', 'lbfgs', 'saga'),
        'elasticnet': ('saga',),
        None: ('lbfgs', 'saga'),
    }
    if solver not in supported_solvers[penalty]:
        return 0

    try:
        model = LogisticRegression(
            # C is irrelevant without a penalty; pass the default instead.
            C=C if penalty is not None else 1.0,
            penalty=penalty,
            solver=solver,
            max_iter=max_iter,
            l1_ratio=0.5 if penalty == 'elasticnet' else None,
            # liblinear and saga shuffle the data; a fixed seed makes the
            # score a deterministic function of the individual.
            random_state=42,
        )
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        # NOTE(review): the fitness is measured on the test split, so the
        # search tunes hyper-parameters against the test data and the reported
        # score is optimistic. Consider a separate validation split or
        # cross-validation on the training data.
        return recall_score(y_test, y_pred)
    except Exception as error:
        # Best effort: a failing configuration is reported and scored 0 so the
        # search can continue.
        print(error)
        return 0


In [4]:

def generate_population(size):
    """Create ``size`` random individuals for the genetic search.

    Every gene is drawn from the standard-library ``random`` module (the same
    generator ``mutation`` uses), so a single ``random.seed(...)`` call makes
    the initial population reproducible.

    Args:
        size: Number of individuals to generate.

    Returns:
        List of ``[C, penalty_idx, solver_idx, max_iter]`` lists, where ``C``
        is log-uniform in ``[1e-3, 1e2]``, ``penalty_idx`` is an int in
        ``0..3``, ``solver_idx`` an int in ``0..2`` and ``max_iter`` an int in
        ``100..500``.
    """
    return [
        [
            10 ** random.uniform(-3, 2),  # C, sampled on a log scale
            random.randint(0, 3),         # penalty index
            random.randint(0, 2),         # solver index
            random.randint(100, 500),     # max_iter
        ]
        for _ in range(size)
    ]


In [5]:

def selection(population, fitness_scores, num_parents):
    """Choose parents through repeated two-way tournaments.

    Each round draws two distinct positions at random and keeps the individual
    with the strictly higher fitness; on a tie the second draw wins. The same
    individual may be chosen in more than one round.

    Args:
        population: List of individuals.
        fitness_scores: Scores aligned position-by-position with ``population``.
        num_parents: Number of tournaments to run.

    Returns:
        List of ``num_parents`` winners (references to the population members,
        not copies).
    """
    positions = range(len(population))
    winners = []
    for _ in range(num_parents):
        first, second = random.sample(positions, 2)
        if fitness_scores[first] > fitness_scores[second]:
            champion = first
        else:
            champion = second
        winners.append(population[champion])
    return winners


In [6]:

def crossover(parents, offspring_size):
    """Produce children by single-point crossover.

    For every child, two entries are sampled from ``parents`` and a cut
    position between 1 and ``len(parent) - 1`` is drawn; the child takes the
    first parent's genes before the cut and the second parent's from the cut
    onwards.

    Args:
        parents: List of individuals to recombine.
        offspring_size: Number of children to create.

    Returns:
        List of newly built individuals; the parents are left untouched.
    """
    def breed():
        head_donor, tail_donor = random.sample(parents, 2)
        cut = random.randint(1, len(head_donor) - 1)
        return head_donor[:cut] + tail_donor[cut:]

    return [breed() for _ in range(offspring_size)]


In [7]:

def mutation(offspring, rate=0.1):
    """Randomly resample one gene in some individuals, in place.

    Each individual is mutated with probability ``rate``. When it is, one
    position is drawn uniformly over its length and, if that position is one
    of the four known genes, the gene is redrawn from its range; any other
    position is left unchanged.

    Args:
        offspring: List of individuals (lists) to mutate in place.
        rate: Per-individual mutation probability.

    Returns:
        The same ``offspring`` list that was passed in.
    """
    # One resampler per gene position: C, penalty index, solver index, max_iter.
    resamplers = {
        0: lambda: 10 ** random.uniform(-3, 2),
        1: lambda: random.randint(0, 3),
        2: lambda: random.randint(0, 2),
        3: lambda: random.randint(100, 500),
    }
    for individual in offspring:
        if not (random.random() < rate):
            continue
        gene = random.randint(0, len(individual) - 1)
        resample = resamplers.get(gene)
        if resample is not None:
            individual[gene] = resample()
    return offspring


In [8]:

def genetic_algorithm(
    generations=20,
    population_size=20,
    num_parents=2,
    mutation_rate=0.8,
    seed=None,
):
    """Run the genetic search and return the best individual seen.

    Each generation scores the current population, updates the best-so-far,
    selects parents, breeds ``population_size - num_parents`` children by
    crossover, mutates them, and forms the next population from the parents
    plus the children. The population bred in the last generation is not
    scored.

    Args:
        generations: Number of generations to run.
        population_size: Number of individuals in the initial population.
        num_parents: Number of parents selected per generation; crossover
            samples two of them per child.
        mutation_rate: Per-child mutation probability passed to ``mutation``.
        seed: Optional integer seed applied to both ``random`` and
            ``np.random`` so a run can be repeated. ``None`` leaves the
            generators untouched.

    Returns:
        Tuple ``(best_individual, best_score)``. ``best_individual`` is a copy
        of the highest-scoring individual, or ``None`` if no individual ever
        scored above 0 (``best_score`` is then 0).
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    population = generate_population(population_size)
    best_individual, best_score = None, 0

    for g in range(generations):
        fitness_scores = [fitness_function(ind) for ind in population]

        for ind, score in zip(population, fitness_scores):
            if score > best_score:
                best_score = score
                # Store a copy so later in-place changes to population members
                # can never alter the recorded best.
                best_individual = list(ind)

        parents = selection(population, fitness_scores, num_parents)
        offspring = crossover(parents, population_size - num_parents)
        offspring = mutation(offspring, rate=mutation_rate)
        population = parents + offspring

        print(f"Generation {g+1} | Best Recall: {best_score:.4f}")

    return best_individual, best_score


In [9]:

# Run the search with its default settings and report the winning configuration.
best_individual, best_score = genetic_algorithm()

# Display names for the penalty and solver indices stored in an individual.
penalties = ['l1', 'l2', 'elasticnet', 'none']
solvers = ['liblinear', 'lbfgs', 'saga']

if best_individual is None:
    # genetic_algorithm returns None when no individual ever scored above 0;
    # indexing it below would raise a TypeError.
    print("\nNo individual scored above 0; nothing to report.")
else:
    print("\nBest Individual Found:")
    print(f"C: {best_individual[0]:.4f}")
    print(f"Penalty: {penalties[best_individual[1]]}")
    print(f"Solver: {solvers[best_individual[2]]}")
    print(f"Max Iter: {best_individual[3]}")
    # NOTE(review): labelled "Validation", but fitness_function computes this
    # recall on X_test_scaled / y_test.
    print(f"Validation Recall: {best_score:.4f}")


Generation 1 | Best Recall: 0.6000
Generation 2 | Best Recall: 0.6000
Generation 3 | Best Recall: 0.6000
Generation 4 | Best Recall: 0.6000
Generation 5 | Best Recall: 0.6000
Generation 6 | Best Recall: 0.7600
Generation 7 | Best Recall: 0.7600
Generation 8 | Best Recall: 0.7600
Generation 9 | Best Recall: 0.7600
Generation 10 | Best Recall: 0.7600
Generation 11 | Best Recall: 0.7600
Generation 12 | Best Recall: 0.7600
Generation 13 | Best Recall: 0.7600
Generation 14 | Best Recall: 0.7600
Generation 15 | Best Recall: 0.7600
Generation 16 | Best Recall: 0.7600
Generation 17 | Best Recall: 0.7600
Generation 18 | Best Recall: 0.7600
Generation 19 | Best Recall: 0.7600
Generation 20 | Best Recall: 0.7600

Best Individual Found:
C: 0.0082
Penalty: l1
Solver: liblinear
Max Iter: 185
Validation Recall: 0.7600
